The below comments should be installed only for the first time
#!pip install pandas
#!pip install numpy
#!pip install matplotlib
#pip install plotly
#pip install prophet
import datetime as dt
import pandas as pd
import numpy as np
import random as rd
import math
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.graph_objs as go
import plotly.offline as py
import pickle
import gc
import os
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm
from statsmodels.tsa.api import VAR
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss
from statsmodels.tsa.api import ExponentialSmoothing
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from prophet import Prophet
from datetime import datetime
import statsmodels.api as sm
import pandas as pd
import plotly.express as px
# specify datatypes before loading the data will save you a ton of memory.
dtypes = {'id': np.uint32,
'store_nbr': np.uint8,
'item_nbr': np.uint32,
'unit_sales': np.float32,
'class': np.uint16,
'dcoilwtico':np.float16,
'transactions':np.uint16,
'cluster': np.uint32,
'onpromotion' : 'object'}
df_train = pd.read_csv(r"C:\users\u2195687\Input\train.csv",parse_dates=['date'],dtype = {
'onpromotion': 'object'
})
df_holiday = pd.read_csv(r"C:\users\u2195687\Input\holidays_events.csv",parse_dates=['date'])
df_items = pd.read_csv(r"C:\users\u2195687\Input\items.csv")
df_oil = pd.read_csv(r"C:\users\u2195687\Input\oil.csv",parse_dates=['date'])
df_stores = pd.read_csv(r"C:\users\u2195687\Input\stores.csv")
df_transactions=pd.read_csv(r"C:\users\u2195687\Input\transactions.csv",parse_dates =['date'])
df_train['Year'] = pd.DatetimeIndex(df_train['date']).year #— to get the year from the date.
df_train['Month'] = pd.DatetimeIndex(df_train['date']).month # To get month from the date.
df_train['Day'] = pd.DatetimeIndex(df_train['date']).day # To get month from the date.
df_train.shape
(125497040, 9)
df_train.head()
| id | date | store_nbr | item_nbr | unit_sales | onpromotion | Year | Month | Day | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 2013-01-01 | 25 | 103665 | 7.0 | NaN | 2013 | 1 | 1 |
| 1 | 1 | 2013-01-01 | 25 | 105574 | 1.0 | NaN | 2013 | 1 | 1 |
| 2 | 2 | 2013-01-01 | 25 | 105575 | 2.0 | NaN | 2013 | 1 | 1 |
| 3 | 3 | 2013-01-01 | 25 | 108079 | 1.0 | NaN | 2013 | 1 | 1 |
| 4 | 4 | 2013-01-01 | 25 | 108701 | 1.0 | NaN | 2013 | 1 | 1 |
df_train.describe()
| id | store_nbr | item_nbr | unit_sales | Year | Month | Day | |
|---|---|---|---|---|---|---|---|
| count | 1.254970e+08 | 1.254970e+08 | 1.254970e+08 | 1.254970e+08 | 1.254970e+08 | 1.254970e+08 | 1.254970e+08 |
| mean | 6.274852e+07 | 2.746458e+01 | 9.727692e+05 | 8.554865e+00 | 2.015223e+03 | 6.334971e+00 | 1.560188e+01 |
| std | 3.622788e+07 | 1.633051e+01 | 5.205336e+05 | 2.360515e+01 | 1.299140e+00 | 3.392866e+00 | 8.816411e+00 |
| min | 0.000000e+00 | 1.000000e+00 | 9.699500e+04 | -1.537200e+04 | 2.013000e+03 | 1.000000e+00 | 1.000000e+00 |
| 25% | 3.137426e+07 | 1.200000e+01 | 5.223830e+05 | 2.000000e+00 | 2.014000e+03 | 3.000000e+00 | 8.000000e+00 |
| 50% | 6.274852e+07 | 2.800000e+01 | 9.595000e+05 | 4.000000e+00 | 2.015000e+03 | 6.000000e+00 | 1.500000e+01 |
| 75% | 9.412278e+07 | 4.300000e+01 | 1.354380e+06 | 9.000000e+00 | 2.016000e+03 | 9.000000e+00 | 2.300000e+01 |
| max | 1.254970e+08 | 5.400000e+01 | 2.127114e+06 | 8.944000e+04 | 2.017000e+03 | 1.200000e+01 | 3.100000e+01 |
df_transactions.head()
| date | store_nbr | transactions | |
|---|---|---|---|
| 0 | 2013-01-01 | 25 | 770 |
| 1 | 2013-01-02 | 1 | 2111 |
| 2 | 2013-01-02 | 2 | 2358 |
| 3 | 2013-01-02 | 3 | 3487 |
| 4 | 2013-01-02 | 4 | 1922 |
i) Transactions dataset has the store number and the transactions details.
ii) We already know that Store number is also available in the train dataset as well.
iii) Transactions denotes the total number of transactions happened on the particular day with respect to each store.
df_holiday.head()
| date | type | locale | locale_name | description | transferred | |
|---|---|---|---|---|---|---|
| 0 | 2012-03-02 | Holiday | Local | Manta | Fundacion de Manta | False |
| 1 | 2012-04-01 | Holiday | Regional | Cotopaxi | Provincializacion de Cotopaxi | False |
| 2 | 2012-04-12 | Holiday | Local | Cuenca | Fundacion de Cuenca | False |
| 3 | 2012-04-14 | Holiday | Local | Libertad | Cantonizacion de Libertad | False |
| 4 | 2012-04-21 | Holiday | Local | Riobamba | Cantonizacion de Riobamba | False |
df_items.head()
| item_nbr | family | class | perishable | |
|---|---|---|---|---|
| 0 | 96995 | GROCERY I | 1093 | 0 |
| 1 | 99197 | GROCERY I | 1067 | 0 |
| 2 | 103501 | CLEANING | 3008 | 0 |
| 3 | 103520 | GROCERY I | 1028 | 0 |
| 4 | 103665 | BREAD/BAKERY | 2712 | 1 |
i) Item dataset has the item number, family, class and perishable( likely to decay)
df_oil.head()
| date | dcoilwtico | |
|---|---|---|
| 0 | 2013-01-01 | NaN |
| 1 | 2013-01-02 | 93.14 |
| 2 | 2013-01-03 | 92.97 |
| 3 | 2013-01-04 | 93.12 |
| 4 | 2013-01-07 | 93.20 |
df_stores.head()
| store_nbr | city | state | type | cluster | |
|---|---|---|---|---|---|
| 0 | 1 | Quito | Pichincha | D | 13 |
| 1 | 2 | Quito | Pichincha | D | 13 |
| 2 | 3 | Quito | Pichincha | D | 8 |
| 3 | 4 | Quito | Pichincha | D | 9 |
| 4 | 5 | Santo Domingo | Santo Domingo de los Tsachilas | D | 4 |
Using the above details , we need to find the Unit_sales prediction for the future year
df_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 125497040 entries, 0 to 125497039 Data columns (total 9 columns): # Column Dtype --- ------ ----- 0 id int64 1 date datetime64[ns] 2 store_nbr int64 3 item_nbr int64 4 unit_sales float64 5 onpromotion object 6 Year int64 7 Month int64 8 Day int64 dtypes: datetime64[ns](1), float64(1), int64(6), object(1) memory usage: 8.4+ GB
df_train.shape
(125497040, 9)
It has 12 million records which is very huge. so, we can consider the sales only for some records.
After visualisation , we can decide the factors to select records.
df_holiday.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 350 entries, 0 to 349 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 350 non-null datetime64[ns] 1 type 350 non-null object 2 locale 350 non-null object 3 locale_name 350 non-null object 4 description 350 non-null object 5 transferred 350 non-null bool dtypes: bool(1), datetime64[ns](1), object(4) memory usage: 14.1+ KB
df_items.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4100 entries, 0 to 4099 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 item_nbr 4100 non-null int64 1 family 4100 non-null object 2 class 4100 non-null int64 3 perishable 4100 non-null int64 dtypes: int64(3), object(1) memory usage: 128.2+ KB
df_oil.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1218 entries, 0 to 1217 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 1218 non-null datetime64[ns] 1 dcoilwtico 1175 non-null float64 dtypes: datetime64[ns](1), float64(1) memory usage: 19.2 KB
df_stores.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 54 entries, 0 to 53 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 store_nbr 54 non-null int64 1 city 54 non-null object 2 state 54 non-null object 3 type 54 non-null object 4 cluster 54 non-null int64 dtypes: int64(2), object(3) memory usage: 2.2+ KB
df_transactions.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 83488 entries, 0 to 83487 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 83488 non-null datetime64[ns] 1 store_nbr 83488 non-null int64 2 transactions 83488 non-null int64 dtypes: datetime64[ns](1), int64(2) memory usage: 1.9 MB
Date fields are of datetime type as we specify the dtype while importing. we can also create the day , month and year feature from the date feature
plt.figure(figsize=(8,4))
df_train['Year'].value_counts(sort = False).plot.bar(y=['unit_sales'],color=['green'],edgecolor='blue',xlabel='Year',
ylabel='Unit Sales')
<AxesSubplot:xlabel='Year', ylabel='Unit Sales'>
2016 have the largest number of sales.
plt.figure(figsize=(8,4))
df_train['Month'].value_counts(sort = False).plot.bar(y=['unit_sales'],color=['Purple'],edgecolor='red',
xlabel = ' Month ',ylabel='Unit Sales')
<AxesSubplot:xlabel=' Month ', ylabel='Unit Sales'>
The distributions are more or less equal with respect to the months
plt.figure(figsize=(10,6))
df_train['Day'].value_counts(sort = False).plot.bar(y=['unit_sales'],color=['cyan'],edgecolor='green',
xlabel = ' Day ',ylabel='Unit Sales')
<AxesSubplot:xlabel=' Day ', ylabel='Unit Sales'>
plt.figure(figsize=(20,8))
df_train['store_nbr'].sort_values().value_counts(sort = False).plot.bar(figsize=(9, 7))
#value_counts sorts the values by descending frequencies by default. Disable sorting using sort=False:
plt.xlabel("Store Number", labelpad=14)
plt.ylabel("Unit_Sales", labelpad=14)
plt.title("Stores vs Sales Comparsion", y=1.02);
df_train.boxplot(column =['unit_sales'], grid = False,color='red')
<AxesSubplot:>
df_train['onpromotion'].value_counts(dropna=False).plot.bar(color='aqua',xlabel='onpromotion',ylabel='count')
<AxesSubplot:xlabel='onpromotion', ylabel='count'>
#transactions
# month over month sales
df_transactions['date']=pd.to_datetime(df_transactions['date'])
temp=df_transactions.groupby(['date']).aggregate({'store_nbr':'count','transactions':np.sum})
temp=temp.reset_index()
temp_2013=temp[temp['date'].dt.year==2013].reset_index(drop=True)
temp_2014=temp[temp['date'].dt.year==2014].reset_index(drop=True)
temp_2015=temp[temp['date'].dt.year==2015].reset_index(drop=True)
temp_2016=temp[temp['date'].dt.year==2016].reset_index(drop=True)
temp_2017=temp[temp['date'].dt.year==2017].reset_index(drop=True)
sns.set(style="whitegrid", color_codes=True)
plt.figure(figsize=(12,8))
plt.subplot(211)
plt.plot(temp_2013['date'],temp_2013.iloc[:,1],label="2013")
plt.plot(temp_2014['date'],temp_2014.iloc[:,1],label="2014")
plt.plot(temp_2015['date'],temp_2015.iloc[:,1],label="2015")
plt.plot(temp_2016['date'],temp_2016.iloc[:,1],label="2016")
plt.plot(temp_2017['date'],temp_2017.iloc[:,1],label="2017")
plt.ylabel('Number of stores open', fontsize=12)
plt.xlabel('Time', fontsize=12)
plt.title('Number of stores open', fontsize=15)
plt.xticks(rotation='vertical')
plt.legend(['2013', '2014', '2015', '2016'], loc='lower right')
<matplotlib.legend.Legend at 0x1ca00a647c0>
plt.figure(figsize=(12,6))
plt.plot(df_transactions.rolling(window=30,center=False).mean(),label='Rolling Mean');
plt.plot(df_transactions.rolling(window=30,center=False).std(),label='Rolling sd');
plt.legend();
plt.style.use('seaborn-white')
plt.figure(figsize=(12,6))
plt.plot(df_transactions.date.values, df_transactions.transactions.values, color='darkblue')
plt.ylim(-50, 10000)
plt.title("Distribution of transactions per day from 2013 till 2017")
plt.ylabel('transactions per day', fontsize= 16)
plt.xlabel('Date', fontsize= 16)
plt.show()
df_items['perishable'].value_counts(sort = False).plot.bar(color=['blue'],edgecolor='green',
xlabel='Perishable Items',ylabel='count')
<AxesSubplot:xlabel='Perishable Items', ylabel='count'>
figsize=(24,40)
df_items['family'].value_counts(sort = False).plot.bar(color=['red'],edgecolor='purple',
xlabel='Item Family',ylabel='count')
<AxesSubplot:xlabel='Item Family', ylabel='count'>
plt.style.use('seaborn-white')
nbr_cluster = df_stores.groupby(['store_nbr','cluster']).size()
nbr_cluster.unstack().iloc[neworder].plot(kind='bar',stacked=True, colormap= 'tab20', figsize=(16,5), grid=False)
plt.title('Store numbers and the clusters they are assigned to', fontsize=14)
plt.ylabel('Clusters')
plt.xlabel('Store number')
plt.show()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Input In [39], in <cell line: 3>() 1 plt.style.use('seaborn-white') 2 nbr_cluster = df_stores.groupby(['store_nbr','cluster']).size() ----> 3 nbr_cluster.unstack().iloc[neworder].plot(kind='bar',stacked=True, colormap= 'tab20', figsize=(16,5), grid=False) 4 plt.title('Store numbers and the clusters they are assigned to', fontsize=14) 5 plt.ylabel('Clusters') NameError: name 'neworder' is not defined
plt.style.use('seaborn-white')
city_cluster = df_stores.groupby(['city','type']).store_nbr.size()
city_cluster.unstack().plot(kind='bar',stacked=True, colormap= 'viridis', figsize=(14,7), grid=False)
plt.title('Stacked Barplot of Store types opened for each city')
plt.ylabel('Count of stores for a particular city')
plt.show()
#df_oil['Year'] = pd.DatetimeIndex(df_oil['date']).year #— to get the year from the date.
#df_oil['Month'] = pd.DatetimeIndex(df_oil['date']).month # To get month from the date.
#df_oil['Day'] = pd.DatetimeIndex(df_oil['date']).day # To get month from the date.
trace = go.Scatter(
name='Oil prices',
x=df_oil['date'],
y=df_oil['dcoilwtico'].dropna(),
mode='lines',
line=dict(color='rgb(25, 15, 200, 0.5)'),
#fillcolor='rgba(68, 68, 68, 0.3)',
fillcolor='rgba(0, 1, 219, 0.4)',
fill='tonexty' )
data = [trace]
layout = go.Layout(
yaxis=dict(title='Daily Oil price'),
title='Daily oil prices from Jan 2013 till July 2017',
showlegend = False)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='pandas-time-series-error-bars')
#df_holiday['Year'] = pd.DatetimeIndex(df_holiday['date']).year #— to get the year from the date.
#df_holiday['Month'] = pd.DatetimeIndex(df_holiday['date']).month # To get month from the date.
#df_holiday['Day'] = pd.DatetimeIndex(df_holiday['date']).day # To get month from the date.
plt.style.use('seaborn-white')
holiday_local_type = df_holiday.groupby(['locale_name', 'type']).size()
holiday_local_type.unstack().plot(kind='bar',stacked=True, colormap= 'magma_r', figsize=(12,6), grid=False)
plt.title('Stacked Barplot of locale name against event type')
plt.ylabel('Count of entries')
plt.show()
Lets Extract 3 years data and keep it in a dataframe and save it in a csv
year_nbr = [2015,2016,2017]
df_trainyr = df_train[df_train['Year'].isin(year_nbr)]
print(df_trainyr.shape)
(86902776, 9)
print(df_train.shape)
(125497040, 9)
df_trainyr.head()
| id | date | store_nbr | item_nbr | unit_sales | onpromotion | Year | Month | Day | |
|---|---|---|---|---|---|---|---|---|---|
| 38594264 | 38594264 | 2015-01-01 | 25 | 103665 | 12.0 | False | 2015 | 1 | 1 |
| 38594265 | 38594265 | 2015-01-01 | 25 | 105575 | 23.0 | False | 2015 | 1 | 1 |
| 38594266 | 38594266 | 2015-01-01 | 25 | 108634 | 1.0 | False | 2015 | 1 | 1 |
| 38594267 | 38594267 | 2015-01-01 | 25 | 108698 | 6.0 | False | 2015 | 1 | 1 |
| 38594268 | 38594268 | 2015-01-01 | 25 | 108786 | 6.0 | False | 2015 | 1 | 1 |
item_num = df_items.loc[df_items['family']=='EGGS']['item_nbr'].tolist()
print(item_num)
[108833, 127547, 158680, 208384, 208386, 227111, 258268, 357961, 357962, 357963, 376427, 407499, 407500, 457424, 457425, 476261, 507478, 507479, 525902, 557256, 557257, 575875, 575876, 617092, 617096, 617103, 617113, 679926, 902198, 902200, 902202, 1082907, 1089163, 1148548, 1162755, 1162759, 1167614, 1259006, 1259007, 1320701, 1974848]
df_egg = df_trainyr[df_trainyr['item_nbr'].isin(item_num)]
print(df_egg.shape)
(967504, 9)
df_egg['item_nbr'].value_counts(sort = False).plot.bar(color=['violet'],edgecolor='green',
xlabel='Egg Items',ylabel='count',figsize = (20,10))
plt.xticks(rotation=45)
plt.tight_layout()
df_eggmin = df_egg[df_egg['item_nbr']==1974848]
df_eggmax = df_egg[df_egg['item_nbr']==208384]
print(df_eggmin.shape)
print(df_eggmax.shape)
(13166, 9) (45526, 9)
so let us proceed with the dataframe df_eggmin
df_eggmax.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 45526 entries, 38594342 to 125493975 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 45526 non-null int64 1 date 45526 non-null datetime64[ns] 2 store_nbr 45526 non-null int64 3 item_nbr 45526 non-null int64 4 unit_sales 45526 non-null float64 5 onpromotion 45526 non-null object 6 Year 45526 non-null int64 7 Month 45526 non-null int64 8 Day 45526 non-null int64 dtypes: datetime64[ns](1), float64(1), int64(6), object(1) memory usage: 3.5+ MB
df_eggmax.head()
| id | date | store_nbr | item_nbr | unit_sales | onpromotion | Year | Month | Day | |
|---|---|---|---|---|---|---|---|---|---|
| 38594342 | 38594342 | 2015-01-01 | 25 | 208384 | 37.0 | False | 2015 | 1 | 1 |
| 38595506 | 38595506 | 2015-01-02 | 1 | 208384 | 27.0 | True | 2015 | 1 | 2 |
| 38596664 | 38596664 | 2015-01-02 | 2 | 208384 | 22.0 | True | 2015 | 1 | 2 |
| 38598159 | 38598159 | 2015-01-02 | 3 | 208384 | 92.0 | True | 2015 | 1 | 2 |
| 38599830 | 38599830 | 2015-01-02 | 4 | 208384 | 14.0 | True | 2015 | 1 | 2 |
df_eggmax['unit_sales'].plot()
<AxesSubplot:>
# Left Join - Train & Items
train_subset = pd.merge(df_eggmax, df_items, on = 'item_nbr', how = 'left')
train_subset.head()
| id | date | store_nbr | item_nbr | unit_sales | onpromotion | Year | Month | Day | family | class | perishable | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 38594342 | 2015-01-01 | 25 | 208384 | 37.0 | False | 2015 | 1 | 1 | EGGS | 2502 | 1 |
| 1 | 38595506 | 2015-01-02 | 1 | 208384 | 27.0 | True | 2015 | 1 | 2 | EGGS | 2502 | 1 |
| 2 | 38596664 | 2015-01-02 | 2 | 208384 | 22.0 | True | 2015 | 1 | 2 | EGGS | 2502 | 1 |
| 3 | 38598159 | 2015-01-02 | 3 | 208384 | 92.0 | True | 2015 | 1 | 2 | EGGS | 2502 | 1 |
| 4 | 38599830 | 2015-01-02 | 4 | 208384 | 14.0 | True | 2015 | 1 | 2 | EGGS | 2502 | 1 |
# Left Join - merged & Store
train_subset = pd.merge(train_subset, df_stores, on = 'store_nbr', how = 'left')
train_subset.head()
| id | date | store_nbr | item_nbr | unit_sales | onpromotion | Year | Month | Day | family | class | perishable | city | state | type | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 38594342 | 2015-01-01 | 25 | 208384 | 37.0 | False | 2015 | 1 | 1 | EGGS | 2502 | 1 | Salinas | Santa Elena | D | 1 |
| 1 | 38595506 | 2015-01-02 | 1 | 208384 | 27.0 | True | 2015 | 1 | 2 | EGGS | 2502 | 1 | Quito | Pichincha | D | 13 |
| 2 | 38596664 | 2015-01-02 | 2 | 208384 | 22.0 | True | 2015 | 1 | 2 | EGGS | 2502 | 1 | Quito | Pichincha | D | 13 |
| 3 | 38598159 | 2015-01-02 | 3 | 208384 | 92.0 | True | 2015 | 1 | 2 | EGGS | 2502 | 1 | Quito | Pichincha | D | 8 |
| 4 | 38599830 | 2015-01-02 | 4 | 208384 | 14.0 | True | 2015 | 1 | 2 | EGGS | 2502 | 1 | Quito | Pichincha | D | 9 |
# Left Join - merged & Oil
train_subset = pd.merge(train_subset, df_oil, on = 'date', how = 'left')
train_subset.head()
| id | date | store_nbr | item_nbr | unit_sales | onpromotion | Year | Month | Day | family | class | perishable | city | state | type | cluster | dcoilwtico | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 38594342 | 2015-01-01 | 25 | 208384 | 37.0 | False | 2015 | 1 | 1 | EGGS | 2502 | 1 | Salinas | Santa Elena | D | 1 | NaN |
| 1 | 38595506 | 2015-01-02 | 1 | 208384 | 27.0 | True | 2015 | 1 | 2 | EGGS | 2502 | 1 | Quito | Pichincha | D | 13 | 52.72 |
| 2 | 38596664 | 2015-01-02 | 2 | 208384 | 22.0 | True | 2015 | 1 | 2 | EGGS | 2502 | 1 | Quito | Pichincha | D | 13 | 52.72 |
| 3 | 38598159 | 2015-01-02 | 3 | 208384 | 92.0 | True | 2015 | 1 | 2 | EGGS | 2502 | 1 | Quito | Pichincha | D | 8 | 52.72 |
| 4 | 38599830 | 2015-01-02 | 4 | 208384 | 14.0 | True | 2015 | 1 | 2 | EGGS | 2502 | 1 | Quito | Pichincha | D | 9 | 52.72 |
# Left Join - Train & Holiday
train_subset = pd.merge(train_subset, df_holiday, on = 'date', how = 'left')
train_subset.head()
| id | date | store_nbr | item_nbr | unit_sales | onpromotion | Year | Month | Day | family | ... | city | state | type_x | cluster | dcoilwtico | type_y | locale | locale_name | description | transferred | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 38594342 | 2015-01-01 | 25 | 208384 | 37.0 | False | 2015 | 1 | 1 | EGGS | ... | Salinas | Santa Elena | D | 1 | NaN | Holiday | National | Ecuador | Primer dia del ano | False |
| 1 | 38595506 | 2015-01-02 | 1 | 208384 | 27.0 | True | 2015 | 1 | 2 | EGGS | ... | Quito | Pichincha | D | 13 | 52.72 | Bridge | National | Ecuador | Puente Primer dia del ano | False |
| 2 | 38596664 | 2015-01-02 | 2 | 208384 | 22.0 | True | 2015 | 1 | 2 | EGGS | ... | Quito | Pichincha | D | 13 | 52.72 | Bridge | National | Ecuador | Puente Primer dia del ano | False |
| 3 | 38598159 | 2015-01-02 | 3 | 208384 | 92.0 | True | 2015 | 1 | 2 | EGGS | ... | Quito | Pichincha | D | 8 | 52.72 | Bridge | National | Ecuador | Puente Primer dia del ano | False |
| 4 | 38599830 | 2015-01-02 | 4 | 208384 | 14.0 | True | 2015 | 1 | 2 | EGGS | ... | Quito | Pichincha | D | 9 | 52.72 | Bridge | National | Ecuador | Puente Primer dia del ano | False |
5 rows × 22 columns
train_subset.columns
Index(['id', 'date', 'store_nbr', 'item_nbr', 'unit_sales', 'onpromotion',
'Year', 'Month', 'Day', 'family', 'class', 'perishable', 'city',
'state', 'type_x', 'cluster', 'dcoilwtico', 'type_y', 'locale',
'locale_name', 'description', 'transferred'],
dtype='object')
train_subset = train_subset.drop(['locale', 'locale_name','description','transferred'], axis=1)
train_subset = train_subset.rename(columns={"type_y": "day_type", "type_x": "type","dcoilwtico":"oil_price"})
train_subset.describe()
| id | store_nbr | item_nbr | unit_sales | Year | Month | Day | class | perishable | cluster | oil_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 4.645800e+04 | 46458.000000 | 46458.0 | 46458.000000 | 46458.000000 | 46458.000000 | 46458.000000 | 46458.0 | 46458.0 | 46458.000000 | 31363.000000 |
| mean | 7.927775e+07 | 27.347045 | 208384.0 | 15.937557 | 2015.879762 | 6.010052 | 15.632636 | 2502.0 | 1.0 | 8.479121 | 46.738771 |
| std | 2.566524e+07 | 15.696945 | 0.0 | 20.012322 | 0.774293 | 3.301916 | 8.799233 | 0.0 | 0.0 | 4.623320 | 6.728534 |
| min | 3.859434e+07 | 1.000000 | 208384.0 | 1.000000 | 2015.000000 | 1.000000 | 1.000000 | 2502.0 | 1.0 | 1.000000 | 26.190000 |
| 25% | 5.617923e+07 | 14.000000 | 208384.0 | 4.000000 | 2015.000000 | 3.000000 | 8.000000 | 2502.0 | 1.0 | 4.000000 | 43.870000 |
| 50% | 7.832521e+07 | 28.000000 | 208384.0 | 9.000000 | 2016.000000 | 6.000000 | 15.000000 | 2502.0 | 1.0 | 9.000000 | 47.280000 |
| 75% | 1.013656e+08 | 41.000000 | 208384.0 | 21.000000 | 2016.000000 | 9.000000 | 23.000000 | 2502.0 | 1.0 | 13.000000 | 50.610000 |
| max | 1.254940e+08 | 54.000000 | 208384.0 | 578.000000 | 2017.000000 | 12.000000 | 31.000000 | 2502.0 | 1.0 | 17.000000 | 61.360000 |
train_subset.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 46458 entries, 0 to 46457 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 46458 non-null int64 1 date 46458 non-null datetime64[ns] 2 store_nbr 46458 non-null int64 3 item_nbr 46458 non-null int64 4 unit_sales 46458 non-null float64 5 onpromotion 46458 non-null object 6 Year 46458 non-null int64 7 Month 46458 non-null int64 8 Day 46458 non-null int64 9 family 46458 non-null object 10 class 46458 non-null int64 11 perishable 46458 non-null int64 12 city 46458 non-null object 13 state 46458 non-null object 14 type 46458 non-null object 15 cluster 46458 non-null int64 16 oil_price 31363 non-null float64 17 day_type 7890 non-null object dtypes: datetime64[ns](1), float64(2), int64(9), object(6) memory usage: 6.7+ MB
train_subset
| id | date | store_nbr | item_nbr | unit_sales | onpromotion | Year | Month | Day | family | class | perishable | city | state | type | cluster | oil_price | day_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 38594342 | 2015-01-01 | 25 | 208384 | 37.0 | False | 2015 | 1 | 1 | EGGS | 2502 | 1 | Salinas | Santa Elena | D | 1 | NaN | Holiday |
| 1 | 38595506 | 2015-01-02 | 1 | 208384 | 27.0 | True | 2015 | 1 | 2 | EGGS | 2502 | 1 | Quito | Pichincha | D | 13 | 52.72 | Bridge |
| 2 | 38596664 | 2015-01-02 | 2 | 208384 | 22.0 | True | 2015 | 1 | 2 | EGGS | 2502 | 1 | Quito | Pichincha | D | 13 | 52.72 | Bridge |
| 3 | 38598159 | 2015-01-02 | 3 | 208384 | 92.0 | True | 2015 | 1 | 2 | EGGS | 2502 | 1 | Quito | Pichincha | D | 8 | 52.72 | Bridge |
| 4 | 38599830 | 2015-01-02 | 4 | 208384 | 14.0 | True | 2015 | 1 | 2 | EGGS | 2502 | 1 | Quito | Pichincha | D | 9 | 52.72 | Bridge |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 46453 | 125484394 | 2017-08-15 | 49 | 208384 | 26.0 | False | 2017 | 8 | 15 | EGGS | 2502 | 1 | Quito | Pichincha | A | 11 | 47.57 | Holiday |
| 46454 | 125487020 | 2017-08-15 | 50 | 208384 | 2.0 | False | 2017 | 8 | 15 | EGGS | 2502 | 1 | Ambato | Tungurahua | A | 14 | 47.57 | Holiday |
| 46455 | 125489400 | 2017-08-15 | 51 | 208384 | 28.0 | False | 2017 | 8 | 15 | EGGS | 2502 | 1 | Guayaquil | Guayas | A | 17 | 47.57 | Holiday |
| 46456 | 125491619 | 2017-08-15 | 52 | 208384 | 28.0 | False | 2017 | 8 | 15 | EGGS | 2502 | 1 | Manta | Manabi | A | 11 | 47.57 | Holiday |
| 46457 | 125493975 | 2017-08-15 | 53 | 208384 | 8.0 | False | 2017 | 8 | 15 | EGGS | 2502 | 1 | Manta | Manabi | D | 13 | 47.57 | Holiday |
46458 rows × 18 columns
train_subsetbkp = train_subset.copy()
train_subset = train_subset.set_index('date')
train_subset_d = train_subset.resample('B').mean()
train_subset_d.shape
(684, 11)
train_subset_with = train_subset.resample('D').mean()
train_subset_with.shape
#Daily - with holidays- facing lot of Nans
(958, 11)
train_subset_dbkp = train_subset_d.copy()
#Back up in another dataframe
train_subset_d
| id | store_nbr | item_nbr | unit_sales | Year | Month | Day | class | perishable | cluster | oil_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||
| 2015-01-01 | 3.859434e+07 | 25.000000 | 208384.0 | 37.000000 | 2015.0 | 1.0 | 1.000000 | 2502.0 | 1.0 | 1.000000 | NaN |
| 2015-01-02 | 3.868373e+07 | 26.661417 | 208384.0 | 25.283465 | 2015.0 | 1.0 | 2.968504 | 2502.0 | 1.0 | 8.881890 | 52.72 |
| 2015-01-05 | 3.880926e+07 | 28.600000 | 208384.0 | 23.875000 | 2015.0 | 1.0 | 5.000000 | 2502.0 | 1.0 | 8.325000 | 50.05 |
| 2015-01-06 | 3.886649e+07 | 27.452381 | 208384.0 | 17.809524 | 2015.0 | 1.0 | 6.000000 | 2502.0 | 1.0 | 8.476190 | 47.98 |
| 2015-01-07 | 3.892401e+07 | 27.139535 | 208384.0 | 17.488372 | 2015.0 | 1.0 | 7.000000 | 2502.0 | 1.0 | 8.488372 | 48.69 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2017-08-09 | 1.248265e+08 | 28.847826 | 208384.0 | 11.565217 | 2017.0 | 8.0 | 9.000000 | 2502.0 | 1.0 | 8.326087 | 49.59 |
| 2017-08-10 | 1.249255e+08 | 28.411765 | 208384.0 | 9.215686 | 2017.0 | 8.0 | 10.000000 | 2502.0 | 1.0 | 8.274510 | 48.54 |
| 2017-08-11 | 1.251311e+08 | 28.468966 | 208384.0 | 14.331034 | 2017.0 | 8.0 | 11.993103 | 2502.0 | 1.0 | 8.324138 | 48.81 |
| 2017-08-14 | 1.253419e+08 | 28.183673 | 208384.0 | 14.000000 | 2017.0 | 8.0 | 14.000000 | 2502.0 | 1.0 | 8.591837 | 47.59 |
| 2017-08-15 | 1.254443e+08 | 28.045455 | 208384.0 | 14.613636 | 2017.0 | 8.0 | 15.000000 | 2502.0 | 1.0 | 8.750000 | 47.57 |
684 rows × 11 columns
Basic Analysis
print("Rows :",train_subset_d.shape[0])
print("Columns :",train_subset_d.shape[1])
print("\n Features :",train_subset_d.columns.tolist())
print("\n Missing Values :",train_subset_d.isnull().sum())
print("\n Unique Values :",train_subset_d.nunique())
Rows : 684 Columns : 11 Features : ['id', 'store_nbr', 'item_nbr', 'unit_sales', 'Year', 'Month', 'Day', 'class', 'perishable', 'cluster', 'oil_price'] Missing Values : id 0 store_nbr 0 item_nbr 0 unit_sales 0 Year 0 Month 0 Day 0 class 0 perishable 0 cluster 0 oil_price 25 dtype: int64 Unique Values : id 684 store_nbr 515 item_nbr 1 unit_sales 628 Year 4 Month 21 Day 154 class 1 perishable 1 cluster 376 oil_price 576 dtype: int64
Its evident that the column item_nbr ,family , class and perishable have only one value so we can drop them. Also, the missing values are available in the fields oil_price and day_type
Indexing - this allows Querying of the date easily and also data retrieval works fine here.At the same time, time series needs date as the index
Querying the date (We can pass the indexed value in the loc function for effective querying)
train_subset_d.loc['2017']
| id | store_nbr | item_nbr | unit_sales | Year | Month | Day | class | perishable | cluster | oil_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||
| 2017-01-02 | 1.017459e+08 | 27.440000 | 208384.0 | 22.420000 | 2017.0 | 1.0 | 2.000000 | 2502.0 | 1.0 | 8.520000 | NaN |
| 2017-01-03 | 1.018579e+08 | 27.125000 | 208384.0 | 18.500000 | 2017.0 | 1.0 | 3.000000 | 2502.0 | 1.0 | 8.750000 | 52.36 |
| 2017-01-04 | 1.019669e+08 | 27.500000 | 208384.0 | 14.600000 | 2017.0 | 1.0 | 4.000000 | 2502.0 | 1.0 | 8.520000 | 53.26 |
| 2017-01-05 | 1.020727e+08 | 28.276596 | 208384.0 | 12.361702 | 2017.0 | 1.0 | 5.000000 | 2502.0 | 1.0 | 8.468085 | 53.77 |
| 2017-01-06 | 1.022850e+08 | 27.225166 | 208384.0 | 15.483444 | 2017.0 | 1.0 | 7.033113 | 2502.0 | 1.0 | 8.437086 | 53.98 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2017-08-09 | 1.248265e+08 | 28.847826 | 208384.0 | 11.565217 | 2017.0 | 8.0 | 9.000000 | 2502.0 | 1.0 | 8.326087 | 49.59 |
| 2017-08-10 | 1.249255e+08 | 28.411765 | 208384.0 | 9.215686 | 2017.0 | 8.0 | 10.000000 | 2502.0 | 1.0 | 8.274510 | 48.54 |
| 2017-08-11 | 1.251311e+08 | 28.468966 | 208384.0 | 14.331034 | 2017.0 | 8.0 | 11.993103 | 2502.0 | 1.0 | 8.324138 | 48.81 |
| 2017-08-14 | 1.253419e+08 | 28.183673 | 208384.0 | 14.000000 | 2017.0 | 8.0 | 14.000000 | 2502.0 | 1.0 | 8.591837 | 47.59 |
| 2017-08-15 | 1.254443e+08 | 28.045455 | 208384.0 | 14.613636 | 2017.0 | 8.0 | 15.000000 | 2502.0 | 1.0 | 8.750000 | 47.57 |
162 rows × 11 columns
train_subset_d.loc['2016':'2017']
| id | store_nbr | item_nbr | unit_sales | Year | Month | Day | class | perishable | cluster | oil_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||
| 2016-01-01 | 6.656135e+07 | 26.960000 | 208384.0 | 21.360000 | 2016.0 | 1.0 | 2.520000 | 2502.0 | 1.0 | 8.480000 | NaN |
| 2016-01-04 | 6.670822e+07 | 26.775510 | 208384.0 | 19.551020 | 2016.0 | 1.0 | 4.000000 | 2502.0 | 1.0 | 8.285714 | 36.81 |
| 2016-01-05 | 6.680518e+07 | 27.666667 | 208384.0 | 15.755556 | 2016.0 | 1.0 | 5.000000 | 2502.0 | 1.0 | 7.822222 | 35.97 |
| 2016-01-06 | 6.689838e+07 | 27.100000 | 208384.0 | 14.140000 | 2016.0 | 1.0 | 6.000000 | 2502.0 | 1.0 | 8.680000 | 33.97 |
| 2016-01-07 | 6.699207e+07 | 28.204545 | 208384.0 | 11.545455 | 2016.0 | 1.0 | 7.000000 | 2502.0 | 1.0 | 8.545455 | 33.29 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2017-08-09 | 1.248265e+08 | 28.847826 | 208384.0 | 11.565217 | 2017.0 | 8.0 | 9.000000 | 2502.0 | 1.0 | 8.326087 | 49.59 |
| 2017-08-10 | 1.249255e+08 | 28.411765 | 208384.0 | 9.215686 | 2017.0 | 8.0 | 10.000000 | 2502.0 | 1.0 | 8.274510 | 48.54 |
| 2017-08-11 | 1.251311e+08 | 28.468966 | 208384.0 | 14.331034 | 2017.0 | 8.0 | 11.993103 | 2502.0 | 1.0 | 8.324138 | 48.81 |
| 2017-08-14 | 1.253419e+08 | 28.183673 | 208384.0 | 14.000000 | 2017.0 | 8.0 | 14.000000 | 2502.0 | 1.0 | 8.591837 | 47.59 |
| 2017-08-15 | 1.254443e+08 | 28.045455 | 208384.0 | 14.613636 | 2017.0 | 8.0 | 15.000000 | 2502.0 | 1.0 | 8.750000 | 47.57 |
423 rows × 11 columns
train_subset_data = train_subset_d['unit_sales']
train_subset_data.head()
date 2015-01-01 37.000000 2015-01-02 25.283465 2015-01-05 23.875000 2015-01-06 17.809524 2015-01-07 17.488372 Freq: B, Name: unit_sales, dtype: float64
train_subset_data.plot(grid=True,xlabel='Date',ylabel='Unit_Sales')
<AxesSubplot:xlabel='Date', ylabel='Unit_Sales'>
Year 2015 Plot
train_2015 = train_subset['2015']
sales_train_2015 = train_2015['unit_sales']
sales_train_2015.plot(grid=True)
<AxesSubplot:xlabel='date'>
Year 2016 Plot
train_2016 = train_subset['2016']
sales_train_2016 = train_2016['unit_sales']
sales_train_2016.plot(grid=True)
<AxesSubplot:xlabel='date'>
Year 2017 Plot
train_2017 = train_subset['2017']
sales_train_2017 = train_2017['unit_sales']
sales_train_2017.plot(grid=True)
<AxesSubplot:xlabel='date'>
Histogram
train_subset_d[['unit_sales']].hist()
array([[<AxesSubplot:title={'center':'unit_sales'}>]], dtype=object)
train_subset_d[['unit_sales']].plot(kind='density')
<AxesSubplot:ylabel='Density'>
Lag Plot: They will show the relationship between the current period and the lag period. Linear shape suggest that a Autoregressive can be applied. This is used to check linearity , randomness , Outliers and check autocorrelation as well
First Order
pd.plotting.lag_plot(train_subset_d['oil_price'],lag=1)
<AxesSubplot:xlabel='y(t)', ylabel='y(t + 1)'>
pd.plotting.lag_plot(train_subset_d['oil_price'],lag=30)
<AxesSubplot:xlabel='y(t)', ylabel='y(t + 30)'>
pd.plotting.lag_plot(train_subset_d['oil_price'],lag=365)
<AxesSubplot:xlabel='y(t)', ylabel='y(t + 365)'>
# Train Dataset
train_subset_d.isnull().sum()
id 0 store_nbr 0 item_nbr 0 unit_sales 0 Year 0 Month 0 Day 0 class 0 perishable 0 cluster 0 oil_price 25 dtype: int64
Displaying the affected records
train_subset_d.query('oil_price != oil_price')
| id | store_nbr | item_nbr | unit_sales | Year | Month | Day | class | perishable | cluster | oil_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||
| 2015-01-01 | 3.859434e+07 | 25.000000 | 208384.0 | 37.000000 | 2015.0 | 1.0 | 1.000000 | 2502.0 | 1.0 | 1.000000 | NaN |
| 2015-01-19 | 3.962400e+07 | 26.095238 | 208384.0 | 17.285714 | 2015.0 | 1.0 | 19.000000 | 2502.0 | 1.0 | 9.357143 | NaN |
| 2015-02-16 | 4.127830e+07 | 27.600000 | 208384.0 | 17.222222 | 2015.0 | 2.0 | 16.000000 | 2502.0 | 1.0 | 8.311111 | NaN |
| 2015-04-03 | 4.417240e+07 | 26.661765 | 208384.0 | 19.029412 | 2015.0 | 4.0 | 3.985294 | 2502.0 | 1.0 | 8.779412 | NaN |
| 2015-05-25 | 4.743724e+07 | 27.173913 | 208384.0 | 15.369565 | 2015.0 | 5.0 | 25.000000 | 2502.0 | 1.0 | 8.347826 | NaN |
| 2015-07-03 | 5.054952e+07 | 27.163043 | 208384.0 | 17.206522 | 2015.0 | 7.0 | 3.798913 | 2502.0 | 1.0 | 8.326087 | NaN |
| 2015-09-07 | 5.596668e+07 | 27.411765 | 208384.0 | 12.803922 | 2015.0 | 9.0 | 7.000000 | 2502.0 | 1.0 | 8.333333 | NaN |
| 2015-11-26 | 6.317249e+07 | 29.000000 | 208384.0 | 9.333333 | 2015.0 | 11.0 | 26.000000 | 2502.0 | 1.0 | 8.222222 | NaN |
| 2015-12-25 | 6.597251e+07 | 27.177083 | 208384.0 | 15.281250 | 2015.0 | 12.0 | 26.458333 | 2502.0 | 1.0 | 8.437500 | NaN |
| 2016-01-01 | 6.656135e+07 | 26.960000 | 208384.0 | 21.360000 | 2016.0 | 1.0 | 2.520000 | 2502.0 | 1.0 | 8.480000 | NaN |
| 2016-01-18 | 6.802387e+07 | 27.541667 | 208384.0 | 14.520833 | 2016.0 | 1.0 | 18.000000 | 2502.0 | 1.0 | 8.166667 | NaN |
| 2016-02-15 | 7.065511e+07 | 27.653061 | 208384.0 | 15.102041 | 2016.0 | 2.0 | 15.000000 | 2502.0 | 1.0 | 8.204082 | NaN |
| 2016-03-25 | 7.444512e+07 | 27.132450 | 208384.0 | 15.211921 | 2016.0 | 3.0 | 25.993377 | 2502.0 | 1.0 | 8.490066 | NaN |
| 2016-05-30 | 8.068726e+07 | 27.480000 | 208384.0 | 14.160000 | 2016.0 | 5.0 | 30.000000 | 2502.0 | 1.0 | 8.440000 | NaN |
| 2016-07-04 | 8.401674e+07 | 26.959184 | 208384.0 | 15.795918 | 2016.0 | 7.0 | 4.000000 | 2502.0 | 1.0 | 8.734694 | NaN |
| 2016-09-05 | 9.011011e+07 | 27.836735 | 208384.0 | 14.326531 | 2016.0 | 9.0 | 5.000000 | 2502.0 | 1.0 | 8.408163 | NaN |
| 2016-11-24 | 9.787039e+07 | 28.125000 | 208384.0 | 9.041667 | 2016.0 | 11.0 | 24.000000 | 2502.0 | 1.0 | 8.000000 | NaN |
| 2016-12-26 | 1.010939e+08 | 27.037736 | 208384.0 | 15.396226 | 2016.0 | 12.0 | 26.000000 | 2502.0 | 1.0 | 8.433962 | NaN |
| 2017-01-02 | 1.017459e+08 | 27.440000 | 208384.0 | 22.420000 | 2017.0 | 1.0 | 2.000000 | 2502.0 | 1.0 | 8.520000 | NaN |
| 2017-01-16 | 1.032213e+08 | 27.117647 | 208384.0 | 12.137255 | 2017.0 | 1.0 | 16.000000 | 2502.0 | 1.0 | 8.411765 | NaN |
| 2017-02-20 | 1.068718e+08 | 26.530612 | 208384.0 | 25.183673 | 2017.0 | 2.0 | 20.000000 | 2502.0 | 1.0 | 8.571429 | NaN |
| 2017-04-14 | 1.125060e+08 | 27.421320 | 208384.0 | 14.908629 | 2017.0 | 4.0 | 14.751269 | 2502.0 | 1.0 | 8.573604 | NaN |
| 2017-05-29 | 1.172143e+08 | 27.720000 | 208384.0 | 12.920000 | 2017.0 | 5.0 | 29.000000 | 2502.0 | 1.0 | 8.600000 | NaN |
| 2017-07-03 | 1.209303e+08 | 27.115385 | 208384.0 | 15.596154 | 2017.0 | 7.0 | 3.000000 | 2502.0 | 1.0 | 8.557692 | NaN |
| 2017-07-04 | 1.210397e+08 | 27.860000 | 208384.0 | 12.660000 | 2017.0 | 7.0 | 4.000000 | 2502.0 | 1.0 | 8.460000 | NaN |
pd.plotting.lag_plot(train_subset['oil_price'],lag=24)
<AxesSubplot:xlabel='y(t)', ylabel='y(t + 24)'>
As per the above plot, there is a lineraity with the previous lag for the current lag in oil_price so we can fill the values # with forward fill(take prev values and fill next value). we can also apply bfill inplace of ffill to fill backwards (takes future values and take). but backward is not preferrable because furure value will not be available in some business scenario.
train_subset_d['oil_price'] = train_subset_d['oil_price'].fillna(method='ffill')
train_subset_d.isnull().sum()
id 0 store_nbr 0 item_nbr 0 unit_sales 0 Year 0 Month 0 Day 0 class 0 perishable 0 cluster 0 oil_price 1 dtype: int64
# Outlier Analysis
fig, axs = plt.subplots(1, figsize = (5,5))
plt1 = sns.boxplot(train_subset_d['unit_sales'],color='red')
#plt2 = sns.boxplot(advertising['Newspaper'], ax = axs[1])
#plt3 = sns.boxplot(advertising['Radio'], ax = axs[2])
plt.tight_layout()
unit_sales = train_subset_d["unit_sales"].values
#print(unit_sales.shape)
plt.scatter(x = range(unit_sales.shape[0]), y = np.sort(unit_sales))
<matplotlib.collections.PathCollection at 0x16a02f8f8e0>
train_subset_z = train_subset_d.copy()
train_subset_z['z_score'] = (train_subset_z['unit_sales'] - train_subset_z['unit_sales'].mean())/train_subset_z['unit_sales'].std(ddof=0)
train_subset_z['z_score'].plot()
<AxesSubplot:xlabel='date'>
train_subset_z[train_subset_z['z_score'] > 4]
| id | store_nbr | item_nbr | unit_sales | Year | Month | Day | class | perishable | cluster | oil_price | z_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | ||||||||||||
| 2015-01-01 | 3.859434e+07 | 25.000000 | 208384.0 | 37.000000 | 2015.0 | 1.0 | 1.000000 | 2502.0 | 1.0 | 1.000000 | NaN | 4.129508 |
| 2016-11-11 | 9.669595e+07 | 27.057592 | 208384.0 | 38.000000 | 2016.0 | 11.0 | 12.036649 | 2502.0 | 1.0 | 8.596859 | 43.39 | 4.316651 |
| 2017-02-28 | 1.077028e+08 | 27.297872 | 208384.0 | 37.191489 | 2017.0 | 2.0 | 28.000000 | 2502.0 | 1.0 | 8.638298 | 54.00 | 4.165344 |
| 2017-03-03 | 1.081367e+08 | 27.037736 | 208384.0 | 37.232704 | 2017.0 | 3.0 | 4.000000 | 2502.0 | 1.0 | 8.433962 | 53.33 | 4.173057 |
| 2017-06-02 | 1.177486e+08 | 27.362500 | 208384.0 | 36.656250 | 2017.0 | 6.0 | 3.000000 | 2502.0 | 1.0 | 8.487500 | 47.68 | 4.065177 |
| 2017-06-07 | 1.181845e+08 | 27.666667 | 208384.0 | 39.708333 | 2017.0 | 6.0 | 7.000000 | 2502.0 | 1.0 | 8.291667 | 45.80 | 4.636354 |
| 2017-06-09 | 1.184937e+08 | 27.348387 | 208384.0 | 49.012903 | 2017.0 | 6.0 | 10.000000 | 2502.0 | 1.0 | 8.593548 | 45.82 | 6.377639 |
| 2017-06-12 | 1.187134e+08 | 27.603774 | 208384.0 | 37.188679 | 2017.0 | 6.0 | 12.000000 | 2502.0 | 1.0 | 8.509434 | 46.10 | 4.164818 |
| 2017-06-14 | 1.189160e+08 | 27.607843 | 208384.0 | 40.431373 | 2017.0 | 6.0 | 14.000000 | 2502.0 | 1.0 | 8.352941 | 44.79 | 4.771665 |
| 2017-06-16 | 1.192308e+08 | 27.694268 | 208384.0 | 44.923567 | 2017.0 | 6.0 | 17.012739 | 2502.0 | 1.0 | 8.356688 | 44.73 | 5.612348 |
| 2017-06-19 | 1.194465e+08 | 26.836735 | 208384.0 | 39.040816 | 2017.0 | 6.0 | 19.000000 | 2502.0 | 1.0 | 8.816327 | 44.24 | 4.511432 |
#exclude the rows with z score more than 4
from scipy.stats import zscore
train_subset_z = train_subset_z[(np.abs(zscore(train_subset_z['z_score'])) < 2)]
unit_sales_z = train_subset_z['z_score'].values
plt.scatter(x = range(unit_sales_z.shape[0]), y = np.sort(unit_sales_z))
<matplotlib.collections.PathCollection at 0x16a0dabc4f0>
train_subset_z.shape
(650, 12)
# Outlier Analysis
fig, axs = plt.subplots(1, figsize = (5,5))
plt1 = sns.boxplot(train_subset_z['unit_sales'],color='red')
plt.tight_layout()
train_subset_s = train_subset_z.copy()
# histogram plot
plt.hist(train_subset_s['unit_sales'])
plt.show()
train_subset_s[['unit_sales']].plot(kind='density')
<AxesSubplot:ylabel='Density'>
train_subset_ss = train_subset_s.copy()
from sklearn.preprocessing import StandardScaler
scale= StandardScaler()
scaled_data = scale.fit_transform(train_subset_ss)
print(scaled_data)
[[-1.51371483 -1.2590777 0. ... 1.7856534 0.87487397 3.66851288] [-1.50874853 2.27609432 0. ... -0.70492353 0.48489209 3.21227713] [-1.5064842 0.18331242 0. ... -0.02875468 0.18254658 1.24752267] ... [ 1.90644586 2.03714169 0. ... -0.70877896 0.3037769 0.12075583] [ 1.91478561 1.51688713 0. ... 0.48844984 0.12558293 0.01352576] [ 1.91883872 1.26483305 0. ... 1.19580309 0.12266172 0.21229742]]
train_subset_ss.drop(columns=['id','z_score'],inplace=True)
train_subset_ss[['unit_sales']].plot(kind='density')
<AxesSubplot:ylabel='Density'>
train_corr = train_subset_d[['unit_sales','cluster','oil_price']].corr(method='pearson')
train_corr
g = sns.heatmap(train_corr,vmax=6,center=0,
square = True,linewidths=5,cbar_kws = {'shrink':.5},annot=True,fmt='.2f',cmap='twilight')
g.figure.set_size_inches(10,10)
train_corr = train_subset_ss[['unit_sales','cluster','oil_price']].corr(method='pearson')
train_corr
| unit_sales | cluster | oil_price | |
|---|---|---|---|
| unit_sales | 1.000000 | 0.105611 | 0.081837 |
| cluster | 0.105611 | 1.000000 | 0.207345 |
| oil_price | 0.081837 | 0.207345 | 1.000000 |
df_decomp = train_subset_s.copy()
df_decomp
| id | store_nbr | item_nbr | unit_sales | Year | Month | Day | class | perishable | cluster | oil_price | z_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | ||||||||||||
| 2015-01-02 | 3.868373e+07 | 26.661417 | 208384.0 | 25.283465 | 2015.0 | 1.0 | 2.968504 | 2502.0 | 1.0 | 8.881890 | 52.72 | 1.936839 |
| 2015-01-05 | 3.880926e+07 | 28.600000 | 208384.0 | 23.875000 | 2015.0 | 1.0 | 5.000000 | 2502.0 | 1.0 | 8.325000 | 50.05 | 1.673255 |
| 2015-01-06 | 3.886649e+07 | 27.452381 | 208384.0 | 17.809524 | 2015.0 | 1.0 | 6.000000 | 2502.0 | 1.0 | 8.476190 | 47.98 | 0.538143 |
| 2015-01-07 | 3.892401e+07 | 27.139535 | 208384.0 | 17.488372 | 2015.0 | 1.0 | 7.000000 | 2502.0 | 1.0 | 8.488372 | 48.69 | 0.478042 |
| 2015-01-08 | 3.898065e+07 | 26.170732 | 208384.0 | 12.121951 | 2015.0 | 1.0 | 8.000000 | 2502.0 | 1.0 | 9.170732 | 48.80 | -0.526247 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2017-08-09 | 1.248265e+08 | 28.847826 | 208384.0 | 11.565217 | 2017.0 | 8.0 | 9.000000 | 2502.0 | 1.0 | 8.326087 | 49.59 | -0.630436 |
| 2017-08-10 | 1.249255e+08 | 28.411765 | 208384.0 | 9.215686 | 2017.0 | 8.0 | 10.000000 | 2502.0 | 1.0 | 8.274510 | 48.54 | -1.070134 |
| 2017-08-11 | 1.251311e+08 | 28.468966 | 208384.0 | 14.331034 | 2017.0 | 8.0 | 11.993103 | 2502.0 | 1.0 | 8.324138 | 48.81 | -0.112832 |
| 2017-08-14 | 1.253419e+08 | 28.183673 | 208384.0 | 14.000000 | 2017.0 | 8.0 | 14.000000 | 2502.0 | 1.0 | 8.591837 | 47.59 | -0.174783 |
| 2017-08-15 | 1.254443e+08 | 28.045455 | 208384.0 | 14.613636 | 2017.0 | 8.0 | 15.000000 | 2502.0 | 1.0 | 8.750000 | 47.57 | -0.059945 |
650 rows × 12 columns
import statsmodels.api as sm
res = sm.tsa.seasonal_decompose(df_decomp['unit_sales'],
model='additive',period=2,extrapolate_trend='freq')
resplot = res.plot()
res.trend.plot()
<AxesSubplot:xlabel='date'>
res.seasonal.plot()
<AxesSubplot:xlabel='date'>
res.resid.plot()
<AxesSubplot:xlabel='date'>
res.observed #ACTUAL UNIT_SALES Value
date
2015-01-02 25.283465
2015-01-05 23.875000
2015-01-06 17.809524
2015-01-07 17.488372
2015-01-08 12.121951
...
2017-08-09 11.565217
2017-08-10 9.215686
2017-08-11 14.331034
2017-08-14 14.000000
2017-08-15 14.613636
Name: unit_sales, Length: 650, dtype: float64
print(res.trend)
date
2015-01-02 26.175889
2015-01-05 22.710747
2015-01-06 19.245605
2015-01-07 16.227055
2015-01-08 15.091983
...
2017-08-09 10.793977
2017-08-10 11.081906
2017-08-11 12.969439
2017-08-14 14.236168
2017-08-15 16.744504
Name: trend, Length: 650, dtype: float64
print(res.seasonal)
date
2015-01-02 -0.039568
2015-01-05 0.039568
2015-01-06 -0.039568
2015-01-07 0.039568
2015-01-08 -0.039568
...
2017-08-09 0.039568
2017-08-10 -0.039568
2017-08-11 0.039568
2017-08-14 -0.039568
2017-08-15 0.039568
Name: seasonal, Length: 650, dtype: float64
res.trend[1] + res.seasonal[1] + res.resid[1]
23.875
pd.DataFrame(res.observed-res.trend).plot()
<AxesSubplot:xlabel='date'>
detrend = pd.DataFrame(res.observed/res.trend)
#detrend.rename(columns = {0:'detrend_sales'},inplace=True)
detrend
| 0 | |
|---|---|
| date | |
| 2015-01-02 | 0.965907 |
| 2015-01-05 | 1.051264 |
| 2015-01-06 | 0.925381 |
| 2015-01-07 | 1.077729 |
| 2015-01-08 | 0.803205 |
| ... | ... |
| 2017-08-09 | 1.071451 |
| 2017-08-10 | 0.831598 |
| 2017-08-11 | 1.104985 |
| 2017-08-14 | 0.983411 |
| 2017-08-15 | 0.872742 |
650 rows × 1 columns
detrend.plot()
<AxesSubplot:xlabel='date'>
train_subset_sbkp = train_subset_s.copy()
train_subset_s = pd.concat([train_subset_s,detrend[0]],axis=1)
train_subset_s = train_subset_s.rename(columns={0:'unit_sales_detrend'})
train_subset_s[['unit_sales','unit_sales_detrend']].plot(figsize=(8,7))
<AxesSubplot:xlabel='date'>
train_subset_s['unit_sales_detrend'].plot()
<AxesSubplot:xlabel='date'>
#train_subset_s['unit_sales_detrend'] = train_subset_s['unit_sales_detrend'].fillna(method='bfill')
train_subset_s.isnull().sum()
id 0 store_nbr 0 item_nbr 0 unit_sales 0 Year 0 Month 0 Day 0 class 0 perishable 0 cluster 0 oil_price 0 z_score 0 unit_sales_detrend 0 dtype: int64
train_subset_s.isnull().sum()
id 0 store_nbr 0 item_nbr 0 unit_sales 0 Year 0 Month 0 Day 0 class 0 perishable 0 cluster 0 oil_price 0 z_score 0 unit_sales_detrend 0 dtype: int64
from statsmodels.tsa.stattools import adfuller
result = adfuller(train_subset_s['unit_sales_detrend'])
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
print('\t%s: %.3f' % (key, value))
if result[0] < result[4]["5%"]:
print ("Reject Ho - Time Series is Stationary")
else:
print ("Failed to Reject Ho - Time Series is Non-Stationary")
ADF Statistic: -7.626554 p-value: 0.000000 Critical Values: 1%: -3.441 5%: -2.866 10%: -2.569 Reject Ho - Time Series is Stationary
from statsmodels.tsa.stattools import kpss
tstest = kpss(train_subset_s['unit_sales_detrend'],'ct')
C:\Users\u2195687\Anaconda3\envs\tensorflow\lib\site-packages\statsmodels\tsa\stattools.py:2022: InterpolationWarning: The test statistic is outside of the range of p-values available in the look-up table. The actual p-value is greater than the p-value returned.
tstest
(0.07263398762086405,
0.1,
29,
{'10%': 0.119, '5%': 0.146, '2.5%': 0.176, '1%': 0.216})
train_subset_s
| id | store_nbr | item_nbr | unit_sales | Year | Month | Day | class | perishable | cluster | oil_price | z_score | unit_sales_detrend | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||||
| 2015-01-02 | 3.868373e+07 | 26.661417 | 208384.0 | 25.283465 | 2015.0 | 1.0 | 2.968504 | 2502.0 | 1.0 | 8.881890 | 52.72 | 1.936839 | 0.965907 |
| 2015-01-05 | 3.880926e+07 | 28.600000 | 208384.0 | 23.875000 | 2015.0 | 1.0 | 5.000000 | 2502.0 | 1.0 | 8.325000 | 50.05 | 1.673255 | 1.051264 |
| 2015-01-06 | 3.886649e+07 | 27.452381 | 208384.0 | 17.809524 | 2015.0 | 1.0 | 6.000000 | 2502.0 | 1.0 | 8.476190 | 47.98 | 0.538143 | 0.925381 |
| 2015-01-07 | 3.892401e+07 | 27.139535 | 208384.0 | 17.488372 | 2015.0 | 1.0 | 7.000000 | 2502.0 | 1.0 | 8.488372 | 48.69 | 0.478042 | 1.077729 |
| 2015-01-08 | 3.898065e+07 | 26.170732 | 208384.0 | 12.121951 | 2015.0 | 1.0 | 8.000000 | 2502.0 | 1.0 | 9.170732 | 48.80 | -0.526247 | 0.803205 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2017-08-09 | 1.248265e+08 | 28.847826 | 208384.0 | 11.565217 | 2017.0 | 8.0 | 9.000000 | 2502.0 | 1.0 | 8.326087 | 49.59 | -0.630436 | 1.071451 |
| 2017-08-10 | 1.249255e+08 | 28.411765 | 208384.0 | 9.215686 | 2017.0 | 8.0 | 10.000000 | 2502.0 | 1.0 | 8.274510 | 48.54 | -1.070134 | 0.831598 |
| 2017-08-11 | 1.251311e+08 | 28.468966 | 208384.0 | 14.331034 | 2017.0 | 8.0 | 11.993103 | 2502.0 | 1.0 | 8.324138 | 48.81 | -0.112832 | 1.104985 |
| 2017-08-14 | 1.253419e+08 | 28.183673 | 208384.0 | 14.000000 | 2017.0 | 8.0 | 14.000000 | 2502.0 | 1.0 | 8.591837 | 47.59 | -0.174783 | 0.983411 |
| 2017-08-15 | 1.254443e+08 | 28.045455 | 208384.0 | 14.613636 | 2017.0 | 8.0 | 15.000000 | 2502.0 | 1.0 | 8.750000 | 47.57 | -0.059945 | 0.872742 |
650 rows × 13 columns
train_split = train_subset_s.loc['2015-01-01':'2017-08-08']
test_split = train_subset_s.loc['2017-08-09':'2017-08-15']
train_split
| id | store_nbr | item_nbr | unit_sales | Year | Month | Day | class | perishable | cluster | oil_price | z_score | unit_sales_detrend | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||||
| 2015-01-02 | 3.868373e+07 | 26.661417 | 208384.0 | 25.283465 | 2015.0 | 1.0 | 2.968504 | 2502.0 | 1.0 | 8.881890 | 52.72 | 1.936839 | 0.965907 |
| 2015-01-05 | 3.880926e+07 | 28.600000 | 208384.0 | 23.875000 | 2015.0 | 1.0 | 5.000000 | 2502.0 | 1.0 | 8.325000 | 50.05 | 1.673255 | 1.051264 |
| 2015-01-06 | 3.886649e+07 | 27.452381 | 208384.0 | 17.809524 | 2015.0 | 1.0 | 6.000000 | 2502.0 | 1.0 | 8.476190 | 47.98 | 0.538143 | 0.925381 |
| 2015-01-07 | 3.892401e+07 | 27.139535 | 208384.0 | 17.488372 | 2015.0 | 1.0 | 7.000000 | 2502.0 | 1.0 | 8.488372 | 48.69 | 0.478042 | 1.077729 |
| 2015-01-08 | 3.898065e+07 | 26.170732 | 208384.0 | 12.121951 | 2015.0 | 1.0 | 8.000000 | 2502.0 | 1.0 | 9.170732 | 48.80 | -0.526247 | 0.803205 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2017-08-02 | 1.240871e+08 | 27.560000 | 208384.0 | 13.520000 | 2017.0 | 8.0 | 2.000000 | 2502.0 | 1.0 | 8.720000 | 49.60 | -0.264612 | 1.052459 |
| 2017-08-03 | 1.241925e+08 | 28.021277 | 208384.0 | 11.042553 | 2017.0 | 8.0 | 3.000000 | 2502.0 | 1.0 | 8.319149 | 49.03 | -0.728249 | 0.868861 |
| 2017-08-04 | 1.244067e+08 | 28.211921 | 208384.0 | 15.231788 | 2017.0 | 8.0 | 5.019868 | 2502.0 | 1.0 | 8.476821 | 49.57 | 0.055738 | 1.104569 |
| 2017-08-07 | 1.246219e+08 | 28.204082 | 208384.0 | 13.653061 | 2017.0 | 8.0 | 7.000000 | 2502.0 | 1.0 | 8.489796 | 49.37 | -0.239710 | 1.023320 |
| 2017-08-08 | 1.247256e+08 | 28.893617 | 208384.0 | 10.829787 | 2017.0 | 8.0 | 8.000000 | 2502.0 | 1.0 | 8.319149 | 49.07 | -0.768066 | 0.924086 |
645 rows × 13 columns
test_split
| id | store_nbr | item_nbr | unit_sales | Year | Month | Day | class | perishable | cluster | oil_price | z_score | unit_sales_detrend | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||||
| 2017-08-09 | 1.248265e+08 | 28.847826 | 208384.0 | 11.565217 | 2017.0 | 8.0 | 9.000000 | 2502.0 | 1.0 | 8.326087 | 49.59 | -0.630436 | 1.071451 |
| 2017-08-10 | 1.249255e+08 | 28.411765 | 208384.0 | 9.215686 | 2017.0 | 8.0 | 10.000000 | 2502.0 | 1.0 | 8.274510 | 48.54 | -1.070134 | 0.831598 |
| 2017-08-11 | 1.251311e+08 | 28.468966 | 208384.0 | 14.331034 | 2017.0 | 8.0 | 11.993103 | 2502.0 | 1.0 | 8.324138 | 48.81 | -0.112832 | 1.104985 |
| 2017-08-14 | 1.253419e+08 | 28.183673 | 208384.0 | 14.000000 | 2017.0 | 8.0 | 14.000000 | 2502.0 | 1.0 | 8.591837 | 47.59 | -0.174783 | 0.983411 |
| 2017-08-15 | 1.254443e+08 | 28.045455 | 208384.0 | 14.613636 | 2017.0 | 8.0 | 15.000000 | 2502.0 | 1.0 | 8.750000 | 47.57 | -0.059945 | 0.872742 |
df_sma = train_subset_s.copy()
train_subset_s.drop
<bound method DataFrame.drop of id store_nbr item_nbr unit_sales Year Month \
date
2015-01-02 3.868373e+07 26.661417 208384.0 25.283465 2015.0 1.0
2015-01-05 3.880926e+07 28.600000 208384.0 23.875000 2015.0 1.0
2015-01-06 3.886649e+07 27.452381 208384.0 17.809524 2015.0 1.0
2015-01-07 3.892401e+07 27.139535 208384.0 17.488372 2015.0 1.0
2015-01-08 3.898065e+07 26.170732 208384.0 12.121951 2015.0 1.0
... ... ... ... ... ... ...
2017-08-09 1.248265e+08 28.847826 208384.0 11.565217 2017.0 8.0
2017-08-10 1.249255e+08 28.411765 208384.0 9.215686 2017.0 8.0
2017-08-11 1.251311e+08 28.468966 208384.0 14.331034 2017.0 8.0
2017-08-14 1.253419e+08 28.183673 208384.0 14.000000 2017.0 8.0
2017-08-15 1.254443e+08 28.045455 208384.0 14.613636 2017.0 8.0
Day class perishable cluster oil_price z_score \
date
2015-01-02 2.968504 2502.0 1.0 8.881890 52.72 1.936839
2015-01-05 5.000000 2502.0 1.0 8.325000 50.05 1.673255
2015-01-06 6.000000 2502.0 1.0 8.476190 47.98 0.538143
2015-01-07 7.000000 2502.0 1.0 8.488372 48.69 0.478042
2015-01-08 8.000000 2502.0 1.0 9.170732 48.80 -0.526247
... ... ... ... ... ... ...
2017-08-09 9.000000 2502.0 1.0 8.326087 49.59 -0.630436
2017-08-10 10.000000 2502.0 1.0 8.274510 48.54 -1.070134
2017-08-11 11.993103 2502.0 1.0 8.324138 48.81 -0.112832
2017-08-14 14.000000 2502.0 1.0 8.591837 47.59 -0.174783
2017-08-15 15.000000 2502.0 1.0 8.750000 47.57 -0.059945
unit_sales_detrend
date
2015-01-02 0.965907
2015-01-05 1.051264
2015-01-06 0.925381
2015-01-07 1.077729
2015-01-08 0.803205
... ...
2017-08-09 1.071451
2017-08-10 0.831598
2017-08-11 1.104985
2017-08-14 0.983411
2017-08-15 0.872742
[650 rows x 13 columns]>
#df_sma = df_sma[['unit_sales_detrend']]
df_sma['ma_rolling_3'] = df_sma['unit_sales_detrend'].rolling(window=3).mean().shift(1)
df_sma[['unit_sales_detrend','ma_rolling_3']].plot(color=['red','green'])
<AxesSubplot:xlabel='date'>
def wma(weights):
def calc(x):
return (weights*x).mean()
return calc
df_sma['wma_rolling_3'] = df_sma['unit_sales_detrend'].rolling(window=3).apply(wma(np.array([0.5,1,1.5]))).shift(1)
df_sma[['unit_sales_detrend','wma_rolling_3']].plot(color=['blue','green'])
<AxesSubplot:xlabel='date'>
df_sma['ewm_window_3'] = df_sma['unit_sales_detrend'].ewm(span=3,adjust=False,min_periods=0).mean().shift(1)
df_sma[['unit_sales_detrend','ewm_window_3']].plot(color=['green','red'])
<AxesSubplot:xlabel='date'>
def rmse(x,y):
return ((df_sma[x]-df_sma[y])**2).mean()**0.5
print('Moving Average')
print(rmse('unit_sales_detrend','ma_rolling_3'))
print('Weighted Moving Average')
print(rmse('unit_sales_detrend','wma_rolling_3'))
print('Exponential Weighted Moving Average')
print(rmse('unit_sales_detrend','ewm_window_3'))
Moving Average 0.12717802366238062 Weighted Moving Average 0.1422072088070689 Exponential Weighted Moving Average 0.14627379829275172
Moving average gives the best results
df_sma[['unit_sales_detrend','ewm_window_3','wma_rolling_3','ma_rolling_3']].plot(color=['violet','red','green','blue'])
<AxesSubplot:xlabel='date'>
#!pip install pystan fbprophet
from prophet import Prophet
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
from datetime import datetime
import pandas as pd
import plotly.express as px
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
from datetime import datetime
import pandas as pd
import plotly.express as px
mpl.rcParams['figure.figsize'] = (10, 8)
mpl.rcParams['axes.grid'] = False
#train_subset_d
df_pro = train_subset_d.copy()
df_pro
| id | store_nbr | item_nbr | unit_sales | Year | Month | Day | class | perishable | cluster | oil_price | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||
| 2015-01-01 | 3.859434e+07 | 25.000000 | 208384.0 | 37.000000 | 2015.0 | 1.0 | 1.000000 | 2502.0 | 1.0 | 1.000000 | NaN |
| 2015-01-02 | 3.868373e+07 | 26.661417 | 208384.0 | 25.283465 | 2015.0 | 1.0 | 2.968504 | 2502.0 | 1.0 | 8.881890 | 52.72 |
| 2015-01-05 | 3.880926e+07 | 28.600000 | 208384.0 | 23.875000 | 2015.0 | 1.0 | 5.000000 | 2502.0 | 1.0 | 8.325000 | 50.05 |
| 2015-01-06 | 3.886649e+07 | 27.452381 | 208384.0 | 17.809524 | 2015.0 | 1.0 | 6.000000 | 2502.0 | 1.0 | 8.476190 | 47.98 |
| 2015-01-07 | 3.892401e+07 | 27.139535 | 208384.0 | 17.488372 | 2015.0 | 1.0 | 7.000000 | 2502.0 | 1.0 | 8.488372 | 48.69 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2017-08-09 | 1.248265e+08 | 28.847826 | 208384.0 | 11.565217 | 2017.0 | 8.0 | 9.000000 | 2502.0 | 1.0 | 8.326087 | 49.59 |
| 2017-08-10 | 1.249255e+08 | 28.411765 | 208384.0 | 9.215686 | 2017.0 | 8.0 | 10.000000 | 2502.0 | 1.0 | 8.274510 | 48.54 |
| 2017-08-11 | 1.251311e+08 | 28.468966 | 208384.0 | 14.331034 | 2017.0 | 8.0 | 11.993103 | 2502.0 | 1.0 | 8.324138 | 48.81 |
| 2017-08-14 | 1.253419e+08 | 28.183673 | 208384.0 | 14.000000 | 2017.0 | 8.0 | 14.000000 | 2502.0 | 1.0 | 8.591837 | 47.59 |
| 2017-08-15 | 1.254443e+08 | 28.045455 | 208384.0 | 14.613636 | 2017.0 | 8.0 | 15.000000 | 2502.0 | 1.0 | 8.750000 | 47.57 |
684 rows × 11 columns
df_pro_a =df_pro.reset_index()[['date','unit_sales']].rename({'date':'ds','unit_sales':'y'}, axis='columns')
df_pro_b = df_pro_a.copy()
train_pro=df_pro_a[(df_pro_a['ds'] >= '2015-01-01') & (df_pro_a['ds'] <= '2017-08-09')]
test_pro=df_pro_a[(df_pro_a['ds'] > '2017-08-09')]
m = Prophet(interval_width=0.95)
m.fit(train_pro)
10:54:19 - cmdstanpy - INFO - Chain [1] start processing 10:54:19 - cmdstanpy - INFO - Chain [1] done processing
<prophet.forecaster.Prophet at 0x16a2fe18040>
#m.params
future = m.make_future_dataframe(periods=369,freq='B') # 4 is test data shape
forecast1 = m.predict(future)
forecast1.head()
| ds | trend | yhat_lower | yhat_upper | trend_lower | trend_upper | additive_terms | additive_terms_lower | additive_terms_upper | weekly | weekly_lower | weekly_upper | yearly | yearly_lower | yearly_upper | multiplicative_terms | multiplicative_terms_lower | multiplicative_terms_upper | yhat | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015-01-01 | 19.878751 | 7.063474 | 22.823650 | 19.878751 | 19.878751 | -5.052877 | -5.052877 | -5.052877 | -6.320405 | -6.320405 | -6.320405 | 1.267528 | 1.267528 | 1.267528 | 0.0 | 0.0 | 0.0 | 14.825874 |
| 1 | 2015-01-02 | 19.868327 | 12.787009 | 29.399220 | 19.868327 | 19.868327 | 1.124926 | 1.124926 | 1.124926 | -0.034667 | -0.034667 | -0.034667 | 1.159593 | 1.159593 | 1.159593 | 0.0 | 0.0 | 0.0 | 20.993253 |
| 2 | 2015-01-05 | 19.837053 | 10.455197 | 25.918745 | 19.837053 | 19.837053 | -1.712586 | -1.712586 | -1.712586 | -2.500785 | -2.500785 | -2.500785 | 0.788199 | 0.788199 | 0.788199 | 0.0 | 0.0 | 0.0 | 18.124467 |
| 3 | 2015-01-06 | 19.826628 | 8.378647 | 23.536835 | 19.826628 | 19.826628 | -3.979772 | -3.979772 | -3.979772 | -4.634728 | -4.634728 | -4.634728 | 0.654957 | 0.654957 | 0.654957 | 0.0 | 0.0 | 0.0 | 15.846856 |
| 4 | 2015-01-07 | 19.816203 | 9.150429 | 24.207645 | 19.816203 | 19.816203 | -3.078566 | -3.078566 | -3.078566 | -3.598074 | -3.598074 | -3.598074 | 0.519508 | 0.519508 | 0.519508 | 0.0 | 0.0 | 0.0 | 16.737637 |
prophet1 = forecast1['yhat']
pd.concat([df_pro_a.set_index('ds')['y'],forecast1.set_index('ds')['yhat']],axis=1).plot()
<AxesSubplot:xlabel='ds'>
fig1 = m.plot(forecast1)
#fig2 = m.plot_components(forecast)
from prophet.plot import add_changepoints_to_plot
fig = m.plot(forecast1)
a = add_changepoints_to_plot(fig.gca(), m, forecast1)
se = np.square(forecast1.loc[:, 'yhat'] - df_pro_a['y'])
mse1 = np.mean(se)
rmse1 = np.sqrt(mse1)
print(mse1)
print(rmse1)
16.13996392332231 4.017457395333809
from sklearn.metrics import r2_score
df_pro_a
| ds | y | |
|---|---|---|
| 0 | 2015-01-01 | 37.000000 |
| 1 | 2015-01-02 | 25.283465 |
| 2 | 2015-01-05 | 23.875000 |
| 3 | 2015-01-06 | 17.809524 |
| 4 | 2015-01-07 | 17.488372 |
| ... | ... | ... |
| 679 | 2017-08-09 | 11.565217 |
| 680 | 2017-08-10 | 9.215686 |
| 681 | 2017-08-11 | 14.331034 |
| 682 | 2017-08-14 | 14.000000 |
| 683 | 2017-08-15 | 14.613636 |
684 rows × 2 columns
print('mse value is',((forecast1.loc[:, 'yhat'] - df_pro_a['y'])**2).mean()**0.5)
print('rmse value is',((forecast1.loc[:, 'yhat'] - df_pro_a['y']) ** 2).mean())
print('mape value is',np.mean(np.abs(((forecast1.loc[:, 'yhat'] - df_pro_a['y'])) / forecast1.loc[:, 'yhat'])) * 100)
print('r_squared is',r2_score(forecast1.loc[:683, 'yhat'], df_pro_a['y']))
mse value is 4.017457395333809 rmse value is 16.13996392332231 mape value is 17.356442427763756 r_squared is -0.33206816920676907
pro_change= Prophet(interval_width=0.95,changepoint_range = 0.95, daily_seasonality=True)
forecast2 = pro_change.fit(train_pro).predict(future)
fig4 = pro_change.plot(forecast2);
b = add_changepoints_to_plot(fig4.gca(), pro_change, forecast2)
10:54:51 - cmdstanpy - INFO - Chain [1] start processing 10:54:51 - cmdstanpy - INFO - Chain [1] done processing
ae = np.square(forecast2.loc[:, 'yhat'] - df_pro_b['y'])
mse2 = np.mean(ae)
rmse2 = np.sqrt(mse2)
print(mse2)
print(rmse2)
16.14996618503385 4.018702052284276
print('mse value is',((forecast2.loc[:, 'yhat'] - df_pro_b['y'])**2).mean()**0.5)
print('rmse value is',((forecast2.loc[:, 'yhat'] - df_pro_b['y']) ** 2).mean())
print('mape value is',np.mean(np.abs(((forecast2.loc[:, 'yhat'] - df_pro_b['y'])) / forecast2.loc[:, 'yhat'])) * 100)
print('r_squared is',r2_score(forecast2.loc[:683, 'yhat'], df_pro_b['y']))
mse value is 4.018702052284276 rmse value is 16.14996618503385 mape value is 17.386673768902796 r_squared is -0.32979559714441953
holi = df_holiday.copy()
holi = holi[['date','type']]
holi = holi.reset_index()[['date','type']].rename({'date':'ds','type':'holiday'}, axis='columns')
holi
| ds | holiday | |
|---|---|---|
| 0 | 2012-03-02 | Holiday |
| 1 | 2012-04-01 | Holiday |
| 2 | 2012-04-12 | Holiday |
| 3 | 2012-04-14 | Holiday |
| 4 | 2012-04-21 | Holiday |
| ... | ... | ... |
| 345 | 2017-12-22 | Additional |
| 346 | 2017-12-23 | Additional |
| 347 | 2017-12-24 | Additional |
| 348 | 2017-12-25 | Holiday |
| 349 | 2017-12-26 | Additional |
350 rows × 2 columns
hol_change= Prophet(changepoint_range = 0.90, daily_seasonality=True,holidays=holi)
forecast3 = hol_change.fit(train_pro).predict(future)
fig5 = pro_change.plot(forecast3);
c = add_changepoints_to_plot(fig5.gca(), pro_change, forecast3)
10:55:06 - cmdstanpy - INFO - Chain [1] start processing 10:55:06 - cmdstanpy - INFO - Chain [1] done processing
be = np.square(forecast3.loc[:, 'yhat'] - df_pro_b['y'])
mse3 = np.mean(be)
rmse3 = np.sqrt(mse3)
print(mse3)
print(rmse3)
15.753969731676962 3.9691270742667037
print('mse value is',((forecast3.loc[:, 'yhat'] - df_pro_b['y'])**2).mean()**0.5)
print('rmse value is',((forecast3.loc[:, 'yhat'] - df_pro_b['y']) ** 2).mean())
print('mape value is',np.mean(np.abs(((forecast3.loc[:, 'yhat'] - df_pro_b['y'])) / forecast2.loc[:, 'yhat'])) * 100)
print('r_squared is',r2_score(forecast3.loc[:683, 'yhat'], df_pro_b['y']))
mse value is 3.9691270742667037 rmse value is 15.753969731676962 mape value is 17.27983932305518 r_squared is -0.26077736638735893
Auto correlation (ACF and PACF plots)
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
df_arma = train_subset.resample('B').mean()
fig = plt.figure(figsize=(15,8))
ax1= fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(df_arma['unit_sales'].iloc[13:],lags=40,ax=ax1)
ax2= fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(df_arma['unit_sales'].iloc[13:],lags=40,ax=ax2)
type(df_arma)
pandas.core.frame.DataFrame
df = df_arma['unit_sales'].squeeze()
type(df)
pandas.core.series.Series
X = df
train_size = int(len(X) * 0.75)
trainset, testset= X[0:train_size], X[train_size:]
def performance(y_true, y_pred):
mse = ((y_pred - y_true) ** 2).mean()
mape= np.mean(np.abs((y_true - y_pred) / y_true)) * 100
r_squared = r2_score(y_true,y_pred)
performance_data= {'MSE':round(mse, 2),
'RMSE':round(np.sqrt(mse), 2),
'MAPE':round(mape, 2),
'RSquared':round(r_squared, 2)
}
return performance_data
def performance2(y_true, y_pred):
#y_true, y_pred = np.array(y_true), np.array(y_pred)
mse = ((y_pred - y_true) ** 2).mean()
mape= np.mean(np.abs((y_true - y_pred) / y_true)) * 100
return( print(' The MSE of forecasts is {}'.format(round(mse, 2))+
'\n The RMSE of forecasts is {}'.format(round(np.sqrt(mse), 2))+
'\n The MAPE of forecasts is {}'.format(round(mape, 2))))
df.values
array([37. , 25.28346457, 23.875 , 17.80952381, 17.48837209,
12.12195122, 18.63565891, 16.48780488, 12.97560976, 14.54761905,
11.86363636, 19.74045802, 17.28571429, 14.0952381 , 14.65116279,
11.81818182, 18.08088235, 15.4047619 , 13.18604651, 13.55813953,
13.37209302, 25.34074074, 19.79069767, 15.11111111, 17.73809524,
14.18604651, 21.71532847, 19.95555556, 18.125 , 16.73809524,
16.63157895, 18.13475177, 17.22222222, 20.87234043, 19.38297872,
15.69565217, 20.97080292, 17.04347826, 13.84782609, 15.74418605,
11.70833333, 21.34782609, 18.02040816, 16.70454545, 16.11111111,
13. , 19.30985915, 19.9047619 , 13.10638298, 14.51111111,
10.35416667, 17.58865248, 15.57446809, 14.81578947, 14.325 ,
11.88372093, 18.27857143, 15.52173913, 11.65957447, 13.34782609,
10.90909091, 18.32142857, 15.22727273, 12.68085106, 19.20454545,
15.21276596, 19.02941176, 17.11111111, 13.04255319, 13.54166667,
11.10638298, 17.86029412, 14.52173913, 12.31111111, 12.93877551,
11.76086957, 19.26470588, 14.57142857, 12.10638298, 13.11111111,
11.74418605, 17. , 14.60416667, 11.08510638, 12.19565217,
13.2173913 , 19.47445255, 16.91111111, 14.36363636, 14.44186047,
11.57777778, 16.99264706, 15.75 , 11.7755102 , 11.31914894,
9.64583333, 20.04895105, 19.55555556, 17.13333333, 18.11111111,
12.48837209, 17.75714286, 15.36956522, 10.80434783, 13.32608696,
11.73170732, 19.20567376, 16.58333333, 13.25 , 14.93181818,
11.11627907, 17.52554745, 14.5 , 11.67391304, 12.3255814 ,
8.17777778, 16.81884058, 14.55319149, 12.41304348, 12.39534884,
10.54545455, 17.24087591, 16.02173913, 12.06521739, 12.44186047,
10.11627907, 16.90714286, 13.25 , 12.375 , 15.21276596,
11.23255814, 17.20652174, 13.625 , 13.88372093, 15.37777778,
12.27083333, 16.7037037 , 13.1875 , 12.33333333, 12.375 ,
12.61904762, 17.45714286, 16.2826087 , 13.97674419, 13.39130435,
12.2826087 , 16.39583333, 13.64705882, 12.77083333, 13.17021277,
10.91111111, 17.66896552, 16.85106383, 13.58695652, 13.8 ,
10.84444444, 13.99305556, 18.81632653, 13.98 , 14.02083333,
11.33333333, 17.74637681, 15.2244898 , 13.04444444, 12.06521739,
10.45652174, 16.5 , 14.69387755, 12.46938776, 13.11111111,
11.17021277, 16.75694444, 15.6875 , 15.04081633, 14.57142857,
13.2 , 17.67333333, 12.80392157, 12.08333333, 11.86 ,
10.73913043, 17.33823529, 14.53333333, 11.26 , 13.80851064,
9.55102041, 16.22297297, 12.79166667, 11.40425532, 13.25 ,
9.83333333, 16.46308725, 14.63043478, 10.46 , 13.10638298,
13.29166667, 17.6 , 14.64583333, 11.93478261, 12.63636364,
9.69565217, 16.30821918, 13.75510204, 9.93617021, 11.75 ,
10.44444444, 16.71621622, 14.12765957, 13.36170213, 12.32608696,
9.22 , 16.3537415 , 11.70588235, 9.7254902 , 11.56818182,
9.76744186, 14.37931034, 13.6 , 17.74509804, 15.71428571,
11.5 , 16.53289474, 12.86 , 10.42857143, 12.34782609,
9.10869565, 17.04137931, 15.04 , 11.23404255, 11.86956522,
10.375 , 16.76870748, 14.6875 , 11.35416667, 13.2173913 ,
9.33333333, 16.65248227, 15.33333333, 14.2826087 , 13.89361702,
11.10638298, 16.65306122, 15.02040816, 13.17021277, 12.57142857,
11.17021277, 16.29370629, 14.53061224, 12.44 , 13.125 ,
11.04444444, 15.91946309, 16.12 , 13.58823529, 16.67346939,
14.58 , 15.28125 , 15.15555556, 13.3 , 17.44 ,
14.08 , 21.36 , 19.55102041, 15.75555556, 14.14 ,
11.54545455, 17.22758621, 14.47058824, 11.2745098 , 11.70833333,
10.88 , 17.13907285, 14.52083333, 10.58333333, 14.57142857,
11.27083333, 16.88356164, 14.35416667, 12.34090909, 13.13333333,
9.75510204, 17.48993289, 16.32692308, 12.91836735, 13.78723404,
11.68085106, 15.0472973 , 13. , 16.41176471, 15.34 ,
12.20833333, 16.39072848, 15.10204082, 13.80392157, 14.11764706,
10.19565217, 16.5 , 13.25531915, 11.4893617 , 12.44897959,
10.10638298, 16.65277778, 13.14285714, 13.91489362, 13.58695652,
10.82 , 17.59589041, 14.9787234 , 11.375 , 12. ,
11.04444444, 16.37671233, 14.3 , 12.64583333, 13.8 ,
11.37777778, 15.59863946, 15.04255319, 11.28571429, 14.36170213,
11.54 , 15.21192053, 14.34693878, 12.08695652, 13.38297872,
9.87234043, 19.26797386, 15.14583333, 11.71428571, 12.23913043,
9.04081633, 15.69333333, 12.56862745, 10.73469388, 10.85714286,
9.32608696, 16.36734694, 15.81632653, 11.60416667, 13.22916667,
10.91111111, 16.87323944, 13.56 , 10.52 , 11.125 ,
9.31372549, 18.65346535, 15.57692308, 13.125 , 13.28571429,
9.72 , 16.0311284 , 14.84615385, 11.25 , 13.18 ,
11.08695652, 17.57931034, 14.31914894, 12.09090909, 11.44680851,
11.14583333, 16.73154362, 12.37254902, 9.97959184, 11.65306122,
10.1875 , 16.54605263, 14.16 , 13.46 , 14.26086957,
10.89130435, 17.20138889, 13.8 , 10.8 , 12.34042553,
10.45652174, 15.58333333, 12.98 , 11.32 , 11.49019608,
10.36734694, 15.99305556, 15.02083333, 11.65217391, 12.15555556,
11.72727273, 15.95102041, 13.64705882, 9.96153846, 11.12 ,
9.97826087, 18.35960591, 15.79591837, 13. , 11.91836735,
10.65957447, 12.26153846, 5.76666667, 6.22222222, 11.05128205,
10.9047619 , 15.99275362, 12.95555556, 11.72340426, 12.2 ,
11.21428571, 15.14583333, 13.875 , 10.61702128, 12.64444444,
9.58333333, 15.21621622, 16.95652174, 11.60416667, 12.25531915,
9.88636364, 14.81690141, 13.7173913 , 11.33333333, 11.04347826,
11.79545455, 14.65467626, 13.46 , 13.59574468, 14.07142857,
9.97826087, 15.81560284, 15.52173913, 12. , 11.4893617 ,
10.39583333, 15.38732394, 13.56 , 11.86666667, 12.43181818,
11.55813953, 16.29577465, 14.32653061, 10.26530612, 12.2826087 ,
10.29787234, 15.10416667, 12.46666667, 11.46666667, 11.92857143,
10.13333333, 15.62937063, 13.0212766 , 10.4893617 , 12.02272727,
9.45652174, 15.38888889, 13.13043478, 13.20930233, 13.27083333,
11.57777778, 20.40559441, 16.5106383 , 13.40909091, 14.5106383 ,
11.91304348, 19.84057971, 16.10638298, 13.59090909, 13.32653061,
12.15217391, 18.76870748, 15.52083333, 12.97826087, 13.65909091,
13.53488372, 19.15107914, 14.91489362, 13.61702128, 15.64444444,
15.67391304, 28.99285714, 22.32653061, 25. , 30.20833333,
26.54347826, 31.57241379, 31.65957447, 25.4893617 , 28.91111111,
19.75555556, 38. , 28.65957447, 24.22916667, 29.35555556,
11.83333333, 18.04166667, 13.92156863, 12.04444444, 13.48888889,
9.04166667, 15.83216783, 13.24 , 10.74468085, 11.08333333,
11.125 , 16.77241379, 12.98076923, 14.62 , 14.14285714,
10.56818182, 14.47972973, 13.25531915, 10.60416667, 11.21276596,
9.5625 , 14.90277778, 13.84313725, 13.04255319, 13.6122449 ,
11.66666667, 15.36633663, 15.39622642, 13.14583333, 14. ,
12.63265306, 15.43 , 22.42 , 18.5 , 14.6 ,
12.36170213, 15.48344371, 12.8490566 , 10.9 , 13.04166667,
9.31914894, 15.79591837, 12.1372549 , 11.36170213, 12.04166667,
9.67391304, 15.76551724, 13.67346939, 10.81632653, 12.12765957,
8.89795918, 14.70394737, 11.49056604, 12.14583333, 13.3 ,
10.84444444, 15.95973154, 13.9 , 10.53061224, 12.2826087 ,
9.65306122, 15.36912752, 12.78 , 9.28571429, 19.73469388,
19.7755102 , 29.79194631, 25.18367347, 23.70833333, 28.5625 ,
23.2745098 , 31.42567568, 25.38461538, 37.19148936, 32.9 ,
29.25 , 37.2327044 , 29.26415094, 26.86792453, 32.48076923,
14.03773585, 21.44654088, 20.73076923, 18.41509434, 17.41509434,
15.13461538, 23.78481013, 19.43396226, 18.41509434, 24.09433962,
12.06666667, 17.63120567, 14.59183673, 11.125 , 13.01960784,
9.66 , 18.14379085, 13.14285714, 12.19607843, 13.97916667,
9.62 , 16.66901408, 12.76470588, 12.08163265, 12.90909091,
12.66 , 14.90862944, 13.55102041, 12.125 , 12.63043478,
10.76744186, 15.65306122, 13.05769231, 10.42 , 11.24489796,
8.02 , 13.95364238, 18.35294118, 14.98 , 12.90196078,
9.04081633, 14.93243243, 12.54901961, 11.40816327, 10.69387755,
8.94117647, 14.85714286, 13.25490196, 13.68 , 13.42857143,
10.39583333, 16.13071895, 12.80392157, 10.89795918, 11.59574468,
9.41176471, 15.46308725, 12.92 , 11.125 , 11.05882353,
10.23076923, 36.65625 , 28.7 , 33.72916667, 39.70833333,
30.21568627, 49.01290323, 37.18867925, 35.80769231, 40.43137255,
33.36 , 44.92356688, 39.04081633, 34.95918367, 32.36170213,
10.08 , 17.41960784, 14.28846154, 10.56603774, 12.53061224,
11.04255319, 16.7012987 , 15.59615385, 12.66 , 13.89583333,
9.79166667, 17.10884354, 13.8 , 11.72 , 12.30434783,
9.85714286, 15.7114094 , 15.60784314, 11.92 , 13.02 ,
10.53061224, 15.59060403, 14.34615385, 10.89583333, 12.80851064,
10.10638298, 15.05769231, 15.28846154, 13.30188679, 13.52 ,
11.04255319, 15.23178808, 13.65306122, 10.82978723, 11.56521739,
9.21568627, 14.33103448, 14. , 14.61363636])
import warnings
from pandas import Series
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
# evaluate an ARIMA model
def evaluate_arima_model(X, arima_order):
# prepare training dataset
train_size = int(len(X) * 0.75)
train, test = X[0:train_size], X[train_size:]
history = [x for x in train]
# make predictions
predictions = list()
for t in range(len(test)):
model = ARIMA(history, order=arima_order)
model_fit = model.fit(disp=0)
yhat = model_fit.forecast()[0]
predictions.append(yhat)
history.append(test[t])
# calculate out of sample error
error = mean_squared_error(test, predictions)
return error
# evaluate the combinations of p, d and q values for an ARIMA model
def evaluate_models(dataset, p_values, d_values, q_values):
dataset = dataset.astype('float32')
best_score, best_cfg = float("inf"), None
for p in p_values:
for d in d_values:
for q in q_values:
order = (p,d,q)
try:
mse = evaluate_arima_model(dataset, order)
if mse < best_score:
best_score, best_cfg = mse, order
print('ARIMA%s MSE=%.3f' % (order,mse))
except:
continue
print('Best ARIMA%s MSE=%.3f' % (best_cfg, best_score))
# evaluate parameters
p_values = [0, 1, 1, 1,1]
d_values = range(1, 2)
q_values = range(1, 2)
warnings.filterwarnings("ignore")
evaluate_models(df.values, p_values, d_values, q_values)
Best ARIMANone MSE=inf
trainset.tail(12)
date 2016-12-02 16.772414 2016-12-05 12.980769 2016-12-06 14.620000 2016-12-07 14.142857 2016-12-08 10.568182 2016-12-09 14.479730 2016-12-12 13.255319 2016-12-13 10.604167 2016-12-14 11.212766 2016-12-15 9.562500 2016-12-16 14.902778 2016-12-19 13.843137 Freq: B, Name: unit_sales, dtype: float64
from statsmodels.tsa.arima.model import ARIMA
model_arima = ARIMA(trainset, order = (10,0,8))
model_arima_fit = model_arima.fit()
arima_predict = model_arima_fit.predict(start=pd.to_datetime('2016-11-21'), end=pd.to_datetime('2017-08-25')
,dynamic=False)
# One step ahead forecast
#observed plot
ax = df.plot(label='Observed',color='#2574BF')
#predicted plot
#rcParams['figure.figsize'] = 14, 7
arima_predict.plot(ax=ax, label='ARIMA (10,0,8) Prediction', alpha= 0.7, color='red')
plt.title('ARIMA sales forecasting')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend()
plt.show()
arima_results= performance(df[-200:],arima_predict)
arima_results
{'MSE': 61.59, 'RMSE': 7.85, 'MAPE': 21.46, 'RSquared': -0.17}
# One step ahead forecast
#observed plot
ax = df.plot(label='Observed',color='#2574BF')
#predicted plot
#rcParams['figure.figsize'] = 14, 7
#arma_predict.plot(ax=ax,label='ARMA (1,1) Prediction', linestyle= '-.', alpha= 0.7, color='r')
arima_predict.plot(ax=ax, label='ARIMA Prediction', linestyle= "--" ,alpha= 0.7, color='g')
plt.title('ARMA(1,1) and ARIMA(10,0,8) sales forecasting comparison')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend()
plt.show()
arima_predict
2016-11-21 14.137415
2016-11-22 11.096202
2016-11-23 14.361623
2016-11-24 7.784645
2016-11-25 16.817891
...
2017-08-21 15.587973
2017-08-22 13.515230
2017-08-23 13.205784
2017-08-24 13.027015
2017-08-25 16.454207
Freq: B, Name: predicted_mean, Length: 200, dtype: float64
df[-12:]
date 2017-07-31 15.288462 2017-08-01 13.301887 2017-08-02 13.520000 2017-08-03 11.042553 2017-08-04 15.231788 2017-08-07 13.653061 2017-08-08 10.829787 2017-08-09 11.565217 2017-08-10 9.215686 2017-08-11 14.331034 2017-08-14 14.000000 2017-08-15 14.613636 Freq: B, Name: unit_sales, dtype: float64
!pip install pmdarima
Requirement already satisfied: pmdarima in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (2.0.1) Requirement already satisfied: setuptools!=50.0.0,>=38.6.0 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from pmdarima) (63.4.1) Requirement already satisfied: Cython!=0.29.18,!=0.29.31,>=0.29 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from pmdarima) (0.29.32) Requirement already satisfied: scipy>=1.3.2 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from pmdarima) (1.7.1) Requirement already satisfied: urllib3 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from pmdarima) (1.26.11) Requirement already satisfied: pandas>=0.19 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from pmdarima) (1.4.4) Requirement already satisfied: statsmodels>=0.13.2 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from pmdarima) (0.13.2) Requirement already satisfied: joblib>=0.11 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from pmdarima) (1.1.0) Requirement already satisfied: scikit-learn>=0.22 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from pmdarima) (1.1.2) Requirement already satisfied: numpy>=1.21 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from pmdarima) (1.22.4) Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from pandas>=0.19->pmdarima) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from pandas>=0.19->pmdarima) (2022.2.1) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from scikit-learn>=0.22->pmdarima) (3.1.0) Requirement already satisfied: packaging>=21.3 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from statsmodels>=0.13.2->pmdarima) (21.3) Requirement already satisfied: patsy>=0.5.2 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from statsmodels>=0.13.2->pmdarima) (0.5.2) Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from packaging>=21.3->statsmodels>=0.13.2->pmdarima) (3.0.9) Requirement already satisfied: six in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from patsy>=0.5.2->statsmodels>=0.13.2->pmdarima) (1.16.0)
## Find optimal order
import pmdarima as pm
model_1 = pm.auto_arima(trainset,seasonal=True, m=12,d=0, D=1, max_p=2, max_q=2,
trace=True,error_action='ignore',suppress_warnings=True)
# Print model summary
print(model_1.summary())
#best model is Fit ARIMA: ARIMA(2,0,2) and seasonal_order=(2, 1, 1, 12); AIC=2639,
Performing stepwise search to minimize aic
ARIMA(2,0,2)(1,1,1)[12] intercept : AIC=inf, Time=2.63 sec
ARIMA(0,0,0)(0,1,0)[12] intercept : AIC=3058.698, Time=0.04 sec
ARIMA(1,0,0)(1,1,0)[12] intercept : AIC=2888.546, Time=0.30 sec
ARIMA(0,0,1)(0,1,1)[12] intercept : AIC=inf, Time=1.21 sec
ARIMA(0,0,0)(0,1,0)[12] : AIC=3057.083, Time=0.03 sec
ARIMA(1,0,0)(0,1,0)[12] intercept : AIC=2958.290, Time=0.10 sec
ARIMA(1,0,0)(2,1,0)[12] intercept : AIC=2840.514, Time=0.75 sec
ARIMA(1,0,0)(2,1,1)[12] intercept : AIC=inf, Time=2.13 sec
ARIMA(1,0,0)(1,1,1)[12] intercept : AIC=inf, Time=1.09 sec
ARIMA(0,0,0)(2,1,0)[12] intercept : AIC=2899.630, Time=0.86 sec
ARIMA(2,0,0)(2,1,0)[12] intercept : AIC=2798.428, Time=0.84 sec
ARIMA(2,0,0)(1,1,0)[12] intercept : AIC=2862.291, Time=0.39 sec
ARIMA(2,0,0)(2,1,1)[12] intercept : AIC=inf, Time=2.95 sec
ARIMA(2,0,0)(1,1,1)[12] intercept : AIC=inf, Time=1.60 sec
ARIMA(2,0,1)(2,1,0)[12] intercept : AIC=2769.263, Time=1.30 sec
ARIMA(2,0,1)(1,1,0)[12] intercept : AIC=2837.743, Time=0.66 sec
ARIMA(2,0,1)(2,1,1)[12] intercept : AIC=inf, Time=3.20 sec
ARIMA(2,0,1)(1,1,1)[12] intercept : AIC=inf, Time=1.74 sec
ARIMA(1,0,1)(2,1,0)[12] intercept : AIC=2786.759, Time=1.04 sec
ARIMA(2,0,2)(2,1,0)[12] intercept : AIC=2762.202, Time=2.02 sec
ARIMA(2,0,2)(1,1,0)[12] intercept : AIC=inf, Time=1.68 sec
ARIMA(2,0,2)(2,1,1)[12] intercept : AIC=inf, Time=3.77 sec
ARIMA(1,0,2)(2,1,0)[12] intercept : AIC=2763.191, Time=1.48 sec
ARIMA(2,0,2)(2,1,0)[12] : AIC=2760.371, Time=0.81 sec
ARIMA(2,0,2)(1,1,0)[12] : AIC=inf, Time=1.30 sec
ARIMA(2,0,2)(2,1,1)[12] : AIC=2639.690, Time=3.55 sec
ARIMA(2,0,2)(1,1,1)[12] : AIC=inf, Time=1.59 sec
ARIMA(2,0,2)(2,1,2)[12] : AIC=inf, Time=4.40 sec
ARIMA(2,0,2)(1,1,2)[12] : AIC=inf, Time=2.53 sec
ARIMA(1,0,2)(2,1,1)[12] : AIC=inf, Time=2.80 sec
ARIMA(2,0,1)(2,1,1)[12] : AIC=inf, Time=2.75 sec
ARIMA(1,0,1)(2,1,1)[12] : AIC=inf, Time=2.92 sec
Best model: ARIMA(2,0,2)(2,1,1)[12]
Total fit time: 54.491 seconds
SARIMAX Results
============================================================================================
Dep. Variable: y No. Observations: 513
Model: SARIMAX(2, 0, 2)x(2, 1, [1], 12) Log Likelihood -1311.845
Date: Thu, 01 Sep 2022 AIC 2639.690
Time: 15:05:37 BIC 2673.423
Sample: 01-01-2015 HQIC 2652.926
- 12-19-2016
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.4786 0.046 10.467 0.000 0.389 0.568
ar.L2 -0.6776 0.044 -15.258 0.000 -0.765 -0.591
ma.L1 -0.0163 0.036 -0.457 0.647 -0.086 0.053
ma.L2 0.8385 0.042 19.922 0.000 0.756 0.921
ar.S.L12 0.1830 0.047 3.881 0.000 0.091 0.275
ar.S.L24 -0.2525 0.060 -4.231 0.000 -0.369 -0.136
ma.S.L12 -0.8840 0.043 -20.366 0.000 -0.969 -0.799
sigma2 10.5052 0.445 23.599 0.000 9.633 11.378
===================================================================================
Ljung-Box (L1) (Q): 6.09 Jarque-Bera (JB): 1086.41
Prob(Q): 0.01 Prob(JB): 0.00
Heteroskedasticity (H): 1.67 Skew: 1.23
Prob(H) (two-sided): 0.00 Kurtosis: 9.78
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
#fitting model
sarima_model_1 = sm.tsa.statespace.SARIMAX(trainset,
order=(2, 0, 2),
seasonal_order=(2, 1, 1, 12),
enforce_stationarity=False,
enforce_invertibility=False)
sarima_fit_1 = sarima_model_1.fit()
print(sarima_fit_1.summary().tables[1])
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 1.8543 0.163 11.363 0.000 1.534 2.174
ar.L2 -0.8595 0.154 -5.593 0.000 -1.161 -0.558
ma.L1 -1.7602 0.189 -9.313 0.000 -2.131 -1.390
ma.L2 0.7314 0.134 5.450 0.000 0.468 0.994
ar.S.L12 -0.1732 0.064 -2.710 0.007 -0.298 -0.048
ar.S.L24 -0.1830 0.060 -3.075 0.002 -0.300 -0.066
ma.S.L12 -0.9945 0.389 -2.554 0.011 -1.758 -0.231
sigma2 7.5062 3.500 2.144 0.032 0.646 14.366
==============================================================================
sarima_fit_1.plot_diagnostics(figsize=(16, 8))
plt.show()
df.tail
<bound method NDFrame.tail of date
2015-01-01 37.000000
2015-01-02 25.283465
2015-01-05 23.875000
2015-01-06 17.809524
2015-01-07 17.488372
...
2017-08-09 11.565217
2017-08-10 9.215686
2017-08-11 14.331034
2017-08-14 14.000000
2017-08-15 14.613636
Freq: B, Name: unit_sales, Length: 684, dtype: float64>
# One step ahead forecast
sarima_predict_1 = sarima_fit_1.get_prediction(start=pd.to_datetime('2016-10-31'), end=pd.to_datetime('2017-08-15')
,dynamic=False)
sarima_predict_conf_1 = sarima_predict_1.conf_int()
#observed plot
ax = df.plot(label='Observed',color='#2574BF')
#predicted plot
#rcParams['figure.figsize'] = 14, 7
sarima_predict_1.predicted_mean.plot(ax=ax, label='SARIMA (2, 0, 2)x(2, 1, 1, 12) Prediction', alpha= 0.7, color='red')
ax.fill_between(sarima_predict_conf_1.index,
#lower sales
sarima_predict_conf_1.iloc[:, 0],
#upper sales
sarima_predict_conf_1.iloc[:, 1], color='k', alpha=0.1)
plt.title('Seasonal ARIMA sales forecasting')
plt.xlabel('Date')
plt.ylabel('Unit Sales')
plt.legend()
plt.show()
trainset.tail(12)
date 2016-12-02 16.772414 2016-12-05 12.980769 2016-12-06 14.620000 2016-12-07 14.142857 2016-12-08 10.568182 2016-12-09 14.479730 2016-12-12 13.255319 2016-12-13 10.604167 2016-12-14 11.212766 2016-12-15 9.562500 2016-12-16 14.902778 2016-12-19 13.843137 Freq: B, Name: unit_sales, dtype: float64
sarima_results=performance(df[-207:],sarima_predict_1.predicted_mean)
sarima_results
{'MSE': 65.42, 'RMSE': 8.09, 'MAPE': 24.35, 'RSquared': 0.01}
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
%matplotlib inline
import seaborn as sns
from statsmodels.graphics import tsaplots
import statsmodels.api as sm
from pylab import rcParams
import itertools
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from pandas import datetime
from pandas import DataFrame
from pandas import concat
from pandas import Series
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout,Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from statsmodels.tools.eval_measures import rmse
import warnings
warnings.filterwarnings("ignore")
#pip install statsmodels
!pip install sklearn
Requirement already satisfied: sklearn in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (0.0) Requirement already satisfied: scikit-learn in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from sklearn) (1.1.2) Requirement already satisfied: numpy>=1.17.3 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from scikit-learn->sklearn) (1.22.4) Requirement already satisfied: scipy>=1.3.2 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from scikit-learn->sklearn) (1.7.1) Requirement already satisfied: joblib>=1.0.0 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from scikit-learn->sklearn) (1.1.0) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from scikit-learn->sklearn) (3.1.0)
!pip install pandas
!pip install numpy
!pip install matplotlib
Collecting pandas
Downloading pandas-1.4.4-cp39-cp39-win_amd64.whl (10.6 MB)
--------------------------------------- 10.6/10.6 MB 65.6 MB/s eta 0:00:00
Collecting pytz>=2020.1
Downloading pytz-2022.2.1-py2.py3-none-any.whl (500 kB)
---------------------------------------- 500.6/500.6 kB ? eta 0:00:00
Requirement already satisfied: numpy>=1.18.5 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from pandas) (1.23.1)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from pandas) (2.8.2)
Requirement already satisfied: six>=1.5 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)
Installing collected packages: pytz, pandas
Successfully installed pandas-1.4.4 pytz-2022.2.1
Requirement already satisfied: numpy in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (1.23.1)
Collecting matplotlib
Downloading matplotlib-3.5.3-cp39-cp39-win_amd64.whl (7.2 MB)
---------------------------------------- 7.2/7.2 MB 66.0 MB/s eta 0:00:00
Requirement already satisfied: numpy>=1.17 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from matplotlib) (1.23.1)
Collecting kiwisolver>=1.0.1
Using cached kiwisolver-1.4.4-cp39-cp39-win_amd64.whl (55 kB)
Collecting fonttools>=4.22.0
Downloading fonttools-4.37.1-py3-none-any.whl (957 kB)
------------------------------------- 957.2/957.2 kB 59.2 MB/s eta 0:00:00
Collecting cycler>=0.10
Using cached cycler-0.11.0-py3-none-any.whl (6.4 kB)
Requirement already satisfied: packaging>=20.0 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from matplotlib) (21.3)
Requirement already satisfied: pyparsing>=2.2.1 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from matplotlib) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from matplotlib) (2.8.2)
Collecting pillow>=6.2.0
Downloading Pillow-9.2.0-cp39-cp39-win_amd64.whl (3.3 MB)
---------------------------------------- 3.3/3.3 MB 105.4 MB/s eta 0:00:00
Requirement already satisfied: six>=1.5 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
Installing collected packages: pillow, kiwisolver, fonttools, cycler, matplotlib
Successfully installed cycler-0.11.0 fonttools-4.37.1 kiwisolver-1.4.4 matplotlib-3.5.3 pillow-9.2.0
pip install seaborn
Collecting seabornNote: you may need to restart the kernel to use updated packages.
Downloading seaborn-0.11.2-py3-none-any.whl (292 kB)
------------------------------------- 292.8/292.8 kB 18.8 MB/s eta 0:00:00
Requirement already satisfied: pandas>=0.23 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from seaborn) (1.4.4)
Requirement already satisfied: scipy>=1.0 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from seaborn) (1.7.1)
Requirement already satisfied: matplotlib>=2.2 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from seaborn) (3.5.3)
Requirement already satisfied: numpy>=1.15 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from seaborn) (1.22.4)
Requirement already satisfied: cycler>=0.10 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from matplotlib>=2.2->seaborn) (0.11.0)
Requirement already satisfied: pillow>=6.2.0 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from matplotlib>=2.2->seaborn) (9.2.0)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from matplotlib>=2.2->seaborn) (2.8.2)
Requirement already satisfied: pyparsing>=2.2.1 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from matplotlib>=2.2->seaborn) (3.0.9)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from matplotlib>=2.2->seaborn) (1.4.4)
Requirement already satisfied: packaging>=20.0 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from matplotlib>=2.2->seaborn) (21.3)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from matplotlib>=2.2->seaborn) (4.37.1)
Requirement already satisfied: pytz>=2020.1 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from pandas>=0.23->seaborn) (2022.2.1)
Requirement already satisfied: six>=1.5 in c:\users\u2195687\anaconda3\envs\tensorflow\lib\site-packages (from python-dateutil>=2.7->matplotlib>=2.2->seaborn) (1.16.0)
Installing collected packages: seaborn
Successfully installed seaborn-0.11.2
pip install itertools
Note: you may need to restart the kernel to use updated packages.
ERROR: Could not find a version that satisfies the requirement itertools (from versions: none) ERROR: No matching distribution found for itertools
df_lstm = train_subset.resample('B').mean()
df1 = df_lstm['unit_sales'].squeeze()
train1, test1 = np.array(df1[:-12]), np.array(df1[-12:])
train1= train1.reshape(-1,1)
test1= test1.reshape(-1,1)
#Scale train and test data to [-1, 1]
scaler = MinMaxScaler()
scaler.fit(train1)
train1 = scaler.transform(train1)
test1 = scaler.transform(test1)
n_input = 671
# univariate
n_features = 1
#TimeseriesGenerator automatically transform a univariate time series dataset into a supervised learning problem.
generator = TimeseriesGenerator(train1, train1, length=n_input, batch_size=10)
######
#set the counter to repeat
n=2
store= np.zeros((671,n))
for i in range(n):
model_vanilla = Sequential()
model_vanilla.add(LSTM(50, activation='relu', input_shape=(671, 1)))
#Add layer
model_vanilla.add(Dense(100, activation='relu'))
model_vanilla.add(Dense(100, activation='relu'))
#Output
model_vanilla.add(Dense(1))
model_vanilla.compile(optimizer='adam', loss='mse')
# 22
model_vanilla.fit_generator(generator,epochs=10)
pred_list = []
batch = train1[-n_input:].reshape((1, n_input, n_features))
for j in range(n_input):
pred_list.append(model_vanilla.predict(batch)[0])
batch = np.append(batch[:,1:,:],[[pred_list[j]]],axis=1)
df_predict_vanilla = pd.DataFrame(scaler.inverse_transform(pred_list),
index=df1[-n_input:].index, columns=['Prediction'])
store[:,i]=df_predict_vanilla['Prediction']
print(store)
Epoch 1/10 1/1 [==============================] - 1s 1s/step - loss: 132735648.0000 Epoch 2/10 1/1 [==============================] - 0s 143ms/step - loss: 26748083240960.0000 Epoch 3/10 1/1 [==============================] - 0s 141ms/step - loss: 226.8840 Epoch 4/10 1/1 [==============================] - 0s 139ms/step - loss: 226.9358 Epoch 5/10 1/1 [==============================] - 0s 141ms/step - loss: 226.9784 Epoch 6/10 1/1 [==============================] - 0s 141ms/step - loss: 227.0147 Epoch 7/10 1/1 [==============================] - 0s 141ms/step - loss: 227.0461 Epoch 8/10 1/1 [==============================] - 0s 141ms/step - loss: 227.0737 Epoch 9/10 1/1 [==============================] - 0s 141ms/step - loss: 227.0980 Epoch 10/10 1/1 [==============================] - 0s 125ms/step - loss: 227.1197 Epoch 1/10 1/1 [==============================] - 1s 1s/step - loss: 6729826.5000 Epoch 2/10 1/1 [==============================] - 0s 141ms/step - loss: 81991856.0000 Epoch 3/10 1/1 [==============================] - 0s 141ms/step - loss: 226.9692 Epoch 4/10 1/1 [==============================] - 0s 141ms/step - loss: 227.0675 Epoch 5/10 1/1 [==============================] - 0s 141ms/step - loss: 227.1482 Epoch 6/10 1/1 [==============================] - 0s 141ms/step - loss: 227.2166 Epoch 7/10 1/1 [==============================] - 0s 141ms/step - loss: 227.2756 Epoch 8/10 1/1 [==============================] - 0s 141ms/step - loss: 227.3271 Epoch 9/10 1/1 [==============================] - 0s 141ms/step - loss: 227.3724 Epoch 10/10 1/1 [==============================] - 0s 125ms/step - loss: 227.4126 [[5.50155184 5.2992984 ] [5.50155184 5.2992984 ] [5.50155184 5.2992984 ] ... [5.50155184 5.2992984 ] [5.50155184 5.2992984 ] [5.50155184 5.2992984 ]]
final_vanilla= np.zeros((store.shape[0],1))
#final_vanilla= np.zeros((store.shape[0],1))
for i in range(store.shape[0]):
final_vanilla[i]=np.mean(store[i,:])
final_vanilla=final_vanilla.reshape((671,))
# report performance
rcParams['figure.figsize'] = 6, 4
# line plot of observed vs predicted
plt.plot(df1.index,df1,label="Observed",color='#2574BF')
plt.plot(df1[13:].index,final_vanilla,label="Vanilla LSTM Prediction")
plt.title('Vanilla LSTM sales forecasting')
plt.xlabel('Date')
plt.ylabel(' Sales')
plt.legend()
plt.show()
def performance(y_true, y_pred):
mse = ((y_pred - y_true) ** 2).mean()
mape= np.mean(np.abs((y_true - y_pred) / y_true)) * 100
r_squared = r2_score(y_true,y_pred)
performance_data= {'MSE':round(mse, 2),
'RMSE':round(np.sqrt(mse), 2),
'MAPE':round(mape, 2),
'RSquared':round(r_squared, 2)
}
return performance_data
def performance2(y_true, y_pred):
#y_true, y_pred = np.array(y_true), np.array(y_pred)
mse = ((y_pred - y_true) ** 2).mean()
mape= np.mean(np.abs((y_true - y_pred) / y_true)) * 100
return( print(' The MSE of forecasts is {}'.format(round(mse, 2))+
'\n The RMSE of forecasts is {}'.format(round(np.sqrt(mse), 2))+
'\n The MAPE of forecasts is {}'.format(round(mape, 2))))
vanilla_lstm= performance(df1[-671:],final_vanilla)
vanilla_lstm
{'MSE': 117.42, 'RMSE': 10.84, 'MAPE': 60.55, 'RSquared': -3.2}
df_input_lstm = train_subset[['unit_sales']]
df_input_lstm
| unit_sales | |
|---|---|
| date | |
| 2015-01-01 | 37.0 |
| 2015-01-02 | 27.0 |
| 2015-01-02 | 22.0 |
| 2015-01-02 | 92.0 |
| 2015-01-02 | 14.0 |
| ... | ... |
| 2017-08-15 | 26.0 |
| 2017-08-15 | 2.0 |
| 2017-08-15 | 28.0 |
| 2017-08-15 | 28.0 |
| 2017-08-15 | 8.0 |
46458 rows × 1 columns
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import tensorflow as tf
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(df_input_lstm)
features=data_scaled
target=data_scaled[:,0]
TimeseriesGenerator(features, target, length=2, sampling_rate=1, batch_size=1)[0]
(array([[[0.06239168],
[0.04506066]]]),
array([0.03639515]))
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=123, shuffle = False)
x_train.shape
(37166, 1)
win_length=2
batch_size=10
num_features=1
train_generator = TimeseriesGenerator(x_train, y_train, length=win_length, sampling_rate=1, batch_size=batch_size)
test_generator = TimeseriesGenerator(x_test, y_test, length=win_length, sampling_rate=1, batch_size=batch_size)
model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(128, input_shape= (win_length, num_features), return_sequences=True))
model.add(tf.keras.layers.LeakyReLU(alpha=0.5))
model.add(tf.keras.layers.LSTM(128, input_shape= (win_length, num_features), return_sequences=True))
model.add(tf.keras.layers.LeakyReLU(alpha=0.5))
model.add(tf.keras.layers.LSTM(128, return_sequences=True))
model.add(tf.keras.layers.LeakyReLU(alpha=0.5))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.LSTM(64, return_sequences=False))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Dense(1))
model.summary()
Model: "sequential_29" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= lstm_48 (LSTM) (None, 2, 128) 66560 _________________________________________________________________ leaky_re_lu_12 (LeakyReLU) (None, 2, 128) 0 _________________________________________________________________ lstm_49 (LSTM) (None, 2, 128) 131584 _________________________________________________________________ leaky_re_lu_13 (LeakyReLU) (None, 2, 128) 0 _________________________________________________________________ lstm_50 (LSTM) (None, 2, 128) 131584 _________________________________________________________________ leaky_re_lu_14 (LeakyReLU) (None, 2, 128) 0 _________________________________________________________________ dropout_8 (Dropout) (None, 2, 128) 0 _________________________________________________________________ lstm_51 (LSTM) (None, 64) 49408 _________________________________________________________________ dropout_9 (Dropout) (None, 64) 0 _________________________________________________________________ dense_75 (Dense) (None, 1) 65 ================================================================= Total params: 379,201 Trainable params: 379,201 Non-trainable params: 0 _________________________________________________________________
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
patience=2,
mode='min')
model.compile(loss=tf.losses.MeanSquaredError(),
optimizer=tf.optimizers.Adam(),
metrics=[tf.metrics.MeanAbsoluteError()])
history = model.fit_generator(train_generator, epochs=10,
validation_data=test_generator,
shuffle=False,
callbacks=[early_stopping])
Epoch 1/10 3717/3717 [==============================] - 29s 6ms/step - loss: 9.5651e-04 - mean_absolute_error: 0.0210 - val_loss: 0.0022 - val_mean_absolute_error: 0.0249 Epoch 2/10 3717/3717 [==============================] - 22s 6ms/step - loss: 8.8598e-04 - mean_absolute_error: 0.0200 - val_loss: 0.0022 - val_mean_absolute_error: 0.0242 Epoch 3/10 3717/3717 [==============================] - 22s 6ms/step - loss: 8.6730e-04 - mean_absolute_error: 0.0197 - val_loss: 0.0022 - val_mean_absolute_error: 0.0242 Epoch 4/10 3717/3717 [==============================] - 22s 6ms/step - loss: 8.6178e-04 - mean_absolute_error: 0.0197 - val_loss: 0.0022 - val_mean_absolute_error: 0.0243
model.evaluate_generator(test_generator, verbose=0)
[0.0021737024653702974, 0.02430996298789978]
predictions=model.predict_generator(test_generator)
x_test[:,1:][win_length:]
array([], shape=(9290, 0), dtype=float64)
df_pred_lstm =pd.concat([pd.DataFrame(predictions), pd.DataFrame(x_test[:,1:][win_length:])],axis=1)
rev_trans=scaler.inverse_transform(df_pred_lstm)
df_final=df_input_lstm[predictions.shape[0]*-1:]
df_final['App_Pred']=rev_trans[:,0]
rcParams['figure.figsize'] = 6, 4
df_final[['unit_sales','App_Pred']].plot()
<AxesSubplot:xlabel='date'>
def rmse(x,y):
return ((df_final[x]-df_final[y])**2).mean()**0.5
def mse(x,y):
return ((df_final[x]-df_final[y]) ** 2).mean()
def mape(x,y):
return np.mean(np.abs(((df_final[x]-df_final[y])) / df_final[x])) * 100
def r_sq(x,y):
return r2_score(df_final[x],df_final[y
])
aa='unit_sales'
bb='App_Pred'
print('rmse is ',rmse(aa,bb))
print('mse is',mse(aa,bb))
print('mape is',mape(aa,bb))
print('R Squared is',r_sq(aa,bb))
rmse is 26.901469809050592 mse is 723.6890778872605 mape is 192.80352782059896 R Squared is 0.013511694519830697